Commit ce1e3b7

Add: dp4a & umul24 instructions
1 parent 22f52c4 commit ce1e3b7
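
For context: `__dp4a(a, b, c)` treats each 32-bit operand as four packed 8-bit lanes and returns `c` plus the sum of the four lane-wise products, i.e. four multiply-accumulates per instruction (available from compute capability 6.1 onward). `__umul24(a, b)` multiplies the low 24 bits of each operand and returns the low 32 bits of the product. A minimal host-side sketch of those semantics for the unsigned variants used below; `dp4a_reference` and `umul24_reference` are hypothetical names, not part of this commit:

unsigned dp4a_reference(unsigned a, unsigned b, unsigned c) {
    // Four byte-wise multiply-accumulates, like the unsigned `__dp4a` overload
    for (int lane = 0; lane < 4; ++lane) {
        unsigned a_byte = (a >> (8 * lane)) & 0xFFu;
        unsigned b_byte = (b >> (8 * lane)) & 0xFFu;
        c += a_byte * b_byte;
    }
    return c;
}

unsigned umul24_reference(unsigned a, unsigned b) {
    // Low 32 bits of the 48-bit product of the low 24 bits, like `__umul24`
    return (a & 0x00FFFFFFu) * (b & 0x00FFFFFFu);
}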

2 files changed: +26 −1 lines changed


less_slow.cpp

Lines changed: 10 additions & 0 deletions
@@ -2137,6 +2137,8 @@ extern __global__ void tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel();
 extern __global__ void tops_f64f64_sm60fma_16x16x16_loop128_cuda_kernel();
 extern __global__ void tops_i32i32_sm60fma_16x16x16_loop128_cuda_kernel();
 extern __global__ void tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel();
+extern __global__ void tops_u8u32_sm60fma_16x16x64_loop128_cuda_kernel();
+extern __global__ void tops_u24u32_sm60fma_16x16x16_loop128_cuda_kernel();
 
 BENCHMARK_CAPTURE( //
     theoretic_tops_cuda, f32f32_sm60fma, tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel, //

@@ -2154,6 +2156,14 @@ BENCHMARK_CAPTURE(
     theoretic_tops_cuda, i64i64_sm60fma, tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel, //
     16, 16, 16, 60, 128, tensor_core_scale_t::single_k)
     ->MinTime(10);
+BENCHMARK_CAPTURE( //
+    theoretic_tops_cuda, u8u32_sm60fma, tops_u8u32_sm60fma_16x16x64_loop128_cuda_kernel, //
+    16, 16, 64, 60, 128, tensor_core_scale_t::single_k)
+    ->MinTime(10);
+BENCHMARK_CAPTURE( //
+    theoretic_tops_cuda, u24u32_sm60fma, tops_u24u32_sm60fma_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 60, 128, tensor_core_scale_t::single_k)
+    ->MinTime(10);
 
 extern __global__ void tops_f16f16_sm70fma_16x16x16_loop128_cuda_kernel();
 extern __global__ void tops_f16f16_sm70wmma_16x16x16_loop128_cuda_kernel();

less_slow.cu

Lines changed: 16 additions & 1 deletion
@@ -157,7 +157,8 @@ __device__ void tops_fma_cuda_kernel() {
     for (int i = 0; i < matrix_side_; ++i)
         for (int j = 0; j < matrix_side_; ++j)
             for (int k = 0; k < matrix_side_; ++k)
-                c_tile[i][j] = fma_operator(a_tile[i][k], b_tile[k][j], c_tile[i][j]);
+                // Assume the second matrix is transposed
+                c_tile[i][j] = fma_operator(a_tile[i][k], b_tile[j][k], c_tile[i][j]);
 }
 
 // Prevent dead-code elimination by writing one result out

@@ -206,6 +207,20 @@ __global__ void tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel() {
     tops_fma_cuda_kernel<std::int64_t, std::int64_t, 16, 128>();
 }
 
+__global__ void tops_u8u32_sm60fma_16x16x64_loop128_cuda_kernel() {
+    struct dp4a_t {
+        inline __device__ uint operator()(uint a, uint b, uint c) const noexcept { return __dp4a(a, b, c); }
+    };
+    tops_fma_cuda_kernel<uint, uint, 16, 128, dp4a_t>();
+}
+
+__global__ void tops_u24u32_sm60fma_16x16x16_loop128_cuda_kernel() {
+    struct umul24_t {
+        inline __device__ uint operator()(uint a, uint b, uint c) const noexcept { return __umul24(a, b) + c; }
+    };
+    tops_fma_cuda_kernel<uint, uint, 16, 128, umul24_t>();
+}
+
 /**
  * Given the growing demand for such workloads, new Dynamic Programming
  * eXtensions @b (DPX) have been added on Hopper for various combinations
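
The functors appear to plug into `tops_fma_cuda_kernel` through a fifth, defaulted template parameter, since the earlier kernels instantiate it with only four arguments. A hedged sketch of the assumed shape; `default_fma_t` is a hypothetical name, and the real definition earlier in less_slow.cu may differ:

struct default_fma_t {
    template <typename scalar_t>
    inline __device__ scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const noexcept {
        return a * b + c; // plain multiply-add, one logical op per call
    }
};

template <typename input_t, typename output_t, int matrix_side_, int repetitions_,
          typename fma_operator_t = default_fma_t>
__device__ void tops_fma_cuda_kernel();

The transposed indexing `b_tile[j][k]` keeps both operands advancing along the same packed k axis, so the four 8-bit lanes inside each `uint` line up as a genuine dot product for `__dp4a`.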
