|
| 1 | +- {'M': 4864, 'N': 4096, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 2 | +- {'M': 4864, 'N': 4096, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 3 | +- {'M': 4864, 'N': 4096, 'K': 4224, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 4 | +- {'M': 4864, 'N': 4096, 'K': 4288, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 16, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 5 | +- {'M': 4864, 'N': 4096, 'K': 4097, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 6 | +- {'M': 4864, 'N': 4096, 'K': 4098, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 7 | +- {'M': 4864, 'N': 4096, 'K': 4100, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 8 | +- {'M': 4864, 'N': 4096, 'K': 4104, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 9 | +- {'M': 4864, 'N': 4096, 'K': 4112, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 10 | +- {'M': 4864, 'N': 8192, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 11 | +- {'M': 4864, 'N': 8192, 'K': 4160, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 12 | +- {'M': 4864, 'N': 8192, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 16, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 13 | +- {'M': 4864, 'N': 8192, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 14 | +- {'M': 1024, 'N': 1024, 'K': 1024, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 15 | +- {'M': 1024, 'N': 1024, 'K': 1024, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 2, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 16 | +- {'M': 8192, 'N': 8192, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 17 | +- {'M': 8192, 'N': 8192, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 18 | +- {'M': 4864, 'N': 4096, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 19 | +- {'M': 4864, 'N': 4096, 'K': 8256, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 2, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 20 | +- {'M': 1024, 'N': 8192, 'K': 28672, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 21 | +- {'M': 1024, 'N': 8192, 'K': 28672, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 2, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 32, 'instruction_sched_variant': 'none'} |
| 22 | +- {'M': 1024, 'N': 28672, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 23 | +- {'M': 1024, 'N': 28672, 'K': 8192, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 24 | +- {'M': 1024, 'N': 14336, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 25 | +- {'M': 1024, 'N': 14336, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 2, 'SPLIT_K': 1, 'num_warps': 2, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 26 | +- {'M': 1, 'N': 8192, 'K': 28672, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 27 | +- {'M': 1, 'N': 8192, 'K': 28672, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 1, 'SPLIT_K': 4, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 28 | +- {'M': 1, 'N': 14336, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 2, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 29 | +- {'M': 1, 'N': 14336, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 30 | +- {'M': 1024, 'N': 16384, 'K': 53248, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 31 | +- {'M': 1024, 'N': 16384, 'K': 53248, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 32 | +- {'M': 1024, 'N': 53248, 'K': 16384, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 33 | +- {'M': 1024, 'N': 53248, 'K': 16384, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 34 | +- {'M': 32, 'N': 16384, 'K': 53248, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 35 | +- {'M': 32, 'N': 16384, 'K': 53248, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 36 | +- {'M': 32, 'N': 53248, 'K': 16384, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 37 | +- {'M': 32, 'N': 53248, 'K': 16384, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 32, 'instruction_sched_variant': 'none'} |
| 38 | +- {'M': 2, 'N': 3584, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 39 | +- {'M': 2, 'N': 3584, 'K': 4096, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 1, 'SPLIT_K': 8, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 40 | +- {'M': 2, 'N': 4096, 'K': 1792, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 16, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 41 | +- {'M': 2, 'N': 4096, 'K': 1792, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 16, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 1, 'SPLIT_K': 4, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 42 | +- {'M': 4096, 'N': 13312, 'K': 8896, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 43 | +- {'M': 4096, 'N': 13312, 'K': 8896, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 44 | +- {'M': 2048, 'N': 17792, 'K': 13312, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 45 | +- {'M': 2048, 'N': 17792, 'K': 13312, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 32, 'instruction_sched_variant': 'none'} |
| 46 | +- {'M': 1024, 'N': 13312, 'K': 1664, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 2, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 47 | +- {'M': 1024, 'N': 13312, 'K': 1664, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 48 | +- {'M': 8192, 'N': 1536, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 49 | +- {'M': 8192, 'N': 1536, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 50 | +- {'M': 8192, 'N': 5120, 'K': 1024, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 51 | +- {'M': 8192, 'N': 5120, 'K': 1024, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 16, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 1, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 52 | +- {'M': 8192, 'N': 1024, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 1, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 53 | +- {'M': 8192, 'N': 1024, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 54 | +- {'M': 32768, 'N': 5120, 'K': 512, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 55 | +- {'M': 32768, 'N': 5120, 'K': 512, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 16, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 56 | +- {'M': 512, 'N': 1536, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 57 | +- {'M': 512, 'N': 1536, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 32, 'instruction_sched_variant': 'none'} |
| 58 | +- {'M': 512, 'N': 5120, 'K': 1024, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 2, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 59 | +- {'M': 512, 'N': 5120, 'K': 1024, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 2, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 60 | +- {'M': 512, 'N': 1024, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 256, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 61 | +- {'M': 512, 'N': 1024, 'K': 5120, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 128, 'GROUP_SIZE_M': 4, 'SPLIT_K': 1, 'num_warps': 4, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 62 | +- {'M': 2048, 'N': 5120, 'K': 512, 'rowMajorA': 'T', 'rowMajorB': 'N', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
| 63 | +- {'M': 2048, 'N': 5120, 'K': 512, 'rowMajorA': 'T', 'rowMajorB': 'T', 'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8, 'SPLIT_K': 1, 'num_warps': 8, 'num_stages': 2, 'waves_per_eu': 0, 'kpack': 2, 'matrix_instr_nonkdim': 16, 'instruction_sched_variant': 'none'} |
0 commit comments