Commit ce1e3b7

Add: dp4a & umul24 instructions
1 parent 22f52c4 commit ce1e3b7
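
For context: `__dp4a(a, b, c)` treats each 32-bit operand as four packed 8-bit lanes and returns `c` plus the sum of the four lane-wise products, i.e. four multiply-accumulates per instruction (available from compute capability 6.1 onward). `__umul24(a, b)` multiplies the low 24 bits of each operand and returns the low 32 bits of the product. A minimal host-side sketch of those semantics for the unsigned variants used below; `dp4a_reference` and `umul24_reference` are hypothetical names, not part of this commit:

unsigned dp4a_reference(unsigned a, unsigned b, unsigned c) {
    // Four byte-wise multiply-accumulates, like the unsigned `__dp4a` overload
    for (int lane = 0; lane < 4; ++lane) {
        unsigned a_byte = (a >> (8 * lane)) & 0xFFu;
        unsigned b_byte = (b >> (8 * lane)) & 0xFFu;
        c += a_byte * b_byte;
    }
    return c;
}

unsigned umul24_reference(unsigned a, unsigned b) {
    // Low 32 bits of the 48-bit product of the low 24 bits, like `__umul24`
    return (a & 0x00FFFFFFu) * (b & 0x00FFFFFFu);
}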

2 files changed: +26 −1 lines changed


less_slow.cpp

Lines changed: 10 additions & 0 deletions
@@ -2137,6 +2137,8 @@ extern __global__ void tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel();
 extern __global__ void tops_f64f64_sm60fma_16x16x16_loop128_cuda_kernel();
 extern __global__ void tops_i32i32_sm60fma_16x16x16_loop128_cuda_kernel();
 extern __global__ void tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel();
+extern __global__ void tops_u8u32_sm60fma_16x16x64_loop128_cuda_kernel();
+extern __global__ void tops_u24u32_sm60fma_16x16x16_loop128_cuda_kernel();
 
 BENCHMARK_CAPTURE( //
     theoretic_tops_cuda, f32f32_sm60fma, tops_f32f32_sm60fma_16x16x16_loop128_cuda_kernel, //

@@ -2154,6 +2156,14 @@ BENCHMARK_CAPTURE(
     theoretic_tops_cuda, i64i64_sm60fma, tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel, //
     16, 16, 16, 60, 128, tensor_core_scale_t::single_k)
     ->MinTime(10);
+BENCHMARK_CAPTURE( //
+    theoretic_tops_cuda, u8u32_sm60fma, tops_u8u32_sm60fma_16x16x64_loop128_cuda_kernel, //
+    16, 16, 64, 60, 128, tensor_core_scale_t::single_k)
+    ->MinTime(10);
+BENCHMARK_CAPTURE( //
+    theoretic_tops_cuda, u24u32_sm60fma, tops_u24u32_sm60fma_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 60, 128, tensor_core_scale_t::single_k)
+    ->MinTime(10);
 
 extern __global__ void tops_f16f16_sm70fma_16x16x16_loop128_cuda_kernel();
 extern __global__ void tops_f16f16_sm70wmma_16x16x16_loop128_cuda_kernel();

less_slow.cu

Lines changed: 16 additions & 1 deletion
@@ -157,7 +157,8 @@ __device__ void tops_fma_cuda_kernel() {
     for (int i = 0; i < matrix_side_; ++i)
         for (int j = 0; j < matrix_side_; ++j)
             for (int k = 0; k < matrix_side_; ++k)
-                c_tile[i][j] = fma_operator(a_tile[i][k], b_tile[k][j], c_tile[i][j]);
+                // Assume the second matrix is transposed
+                c_tile[i][j] = fma_operator(a_tile[i][k], b_tile[j][k], c_tile[i][j]);
 }
 
 // Prevent dead-code elimination by writing one result out

@@ -206,6 +207,20 @@ __global__ void tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel() {
     tops_fma_cuda_kernel<std::int64_t, std::int64_t, 16, 128>();
 }
 
+__global__ void tops_u8u32_sm60fma_16x16x64_loop128_cuda_kernel() {
+    struct dp4a_t {
+        inline __device__ uint operator()(uint a, uint b, uint c) const noexcept { return __dp4a(a, b, c); }
+    };
+    tops_fma_cuda_kernel<uint, uint, 16, 128, dp4a_t>();
+}
+
+__global__ void tops_u24u32_sm60fma_16x16x16_loop128_cuda_kernel() {
+    struct umul24_t {
+        inline __device__ uint operator()(uint a, uint b, uint c) const noexcept { return __umul24(a, b) + c; }
+    };
+    tops_fma_cuda_kernel<uint, uint, 16, 128, umul24_t>();
+}
+
 /**
  * Given the growing demand for such workloads, new Dynamic Programming
  * eXtensions @b (DPX) have been added on Hopper for various combinations
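
The functors appear to plug into `tops_fma_cuda_kernel` through a fifth, defaulted template parameter, since the earlier kernels instantiate it with only four arguments. A hedged sketch of the assumed shape; `default_fma_t` is a hypothetical name, and the real definition earlier in less_slow.cu may differ:

struct default_fma_t {
    template <typename scalar_t>
    inline __device__ scalar_t operator()(scalar_t a, scalar_t b, scalar_t c) const noexcept {
        return a * b + c; // plain multiply-add, one logical op per call
    }
};

template <typename input_t, typename output_t, int matrix_side_, int repetitions_,
          typename fma_operator_t = default_fma_t>
__device__ void tops_fma_cuda_kernel();

The transposed indexing `b_tile[j][k]` keeps both operands advancing along the same packed k axis, so the four 8-bit lanes inside each `uint` line up as a genuine dot product for `__dp4a`.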
