Commit 152e59a

Improve: Shrink PTX loops
1 parent d1909f9 commit 152e59a

5 files changed: +114 additions, -110 deletions

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ build_release/
 .DS_Store
 
 # Temporary binaries
+/tmp/
 less_slow_from_ptx.cubin
 less_slow_from_cu.cubin
 less_slow_from_cu.ptx

less_slow.cpp

Lines changed: 67 additions & 64 deletions
@@ -2080,53 +2080,53 @@ static void theoretic_tops_cuda( //
     state.counters["TOP"] = benchmark::Counter(tops_per_gpu * state.iterations(), benchmark::Counter::kIsRate);
 }
 
-extern __global__ void tops_f16f16_sm70tc_16x16x16_1024unroll_cuda_kernel();
-extern __global__ void tops_f16f32_sm70tc_16x16x16_1024unroll_cuda_kernel();
+extern __global__ void tops_f16f16_sm70tc_16x16x16_loop128_cuda_kernel();
+extern __global__ void tops_f16f32_sm70tc_16x16x16_loop128_cuda_kernel();
 
-extern __global__ void tops_u8i32_sm75tc_16x16x16_1024unroll_cuda_kernel();
-extern __global__ void tops_u4i32_sm75tc_8x8x32_1024unroll_cuda_kernel();
-extern __global__ void tops_b1i32xor_sm75tc_8x8x128_1024unroll_cuda_kernel();
+extern __global__ void tops_u8i32_sm75tc_16x16x16_loop128_cuda_kernel();
+extern __global__ void tops_u4i32_sm75tc_8x8x32_loop128_cuda_kernel();
+extern __global__ void tops_b1i32xor_sm75tc_8x8x128_loop128_cuda_kernel();
 
-extern __global__ void tops_bf16f32_sm80tc_16x16x16_1024unroll_cuda_kernel();
-extern __global__ void tops_tf32f32_sm80tc_16x16x8_1024unroll_cuda_kernel();
-extern __global__ void tops_f64f64_sm80tc_8x8x4_1024unroll_cuda_kernel();
-extern __global__ void tops_b1i32and_sm80tc_8x8x128_1024unroll_cuda_kernel();
+extern __global__ void tops_bf16f32_sm80tc_16x16x16_loop128_cuda_kernel();
+extern __global__ void tops_tf32f32_sm80tc_16x16x8_loop128_cuda_kernel();
+extern __global__ void tops_f64f64_sm80tc_8x8x4_loop128_cuda_kernel();
+extern __global__ void tops_b1i32and_sm80tc_8x8x128_loop128_cuda_kernel();
 
-BENCHMARK_CAPTURE( //
-    theoretic_tops_cuda, f16f16_sm70tc, tops_f16f16_sm70tc_16x16x16_1024unroll_cuda_kernel, //
-    16, 16, 16, 1024, 70)
+BENCHMARK_CAPTURE( //
+    theoretic_tops_cuda, f16f16_sm70tc, tops_f16f16_sm70tc_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 128, 70)
     ->MinTime(10);
-BENCHMARK_CAPTURE( //
-    theoretic_tops_cuda, f16f32_sm70tc, tops_f16f32_sm70tc_16x16x16_1024unroll_cuda_kernel, //
-    16, 16, 16, 1024, 70)
+BENCHMARK_CAPTURE( //
+    theoretic_tops_cuda, f16f32_sm70tc, tops_f16f32_sm70tc_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 128, 70)
     ->MinTime(10);
-BENCHMARK_CAPTURE( //
-    theoretic_tops_cuda, u8i32_sm75tc, tops_u8i32_sm75tc_16x16x16_1024unroll_cuda_kernel, //
-    16, 16, 16, 1024, 75)
+BENCHMARK_CAPTURE( //
+    theoretic_tops_cuda, u8i32_sm75tc, tops_u8i32_sm75tc_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 128, 75)
     ->MinTime(10);
-BENCHMARK_CAPTURE( //
-    theoretic_tops_cuda, u4i32_sm75tc, tops_u4i32_sm75tc_8x8x32_1024unroll_cuda_kernel, //
-    8, 8, 32, 1024, 75)
+BENCHMARK_CAPTURE( //
+    theoretic_tops_cuda, u4i32_sm75tc, tops_u4i32_sm75tc_8x8x32_loop128_cuda_kernel, //
+    8, 8, 32, 128, 75)
     ->MinTime(10);
-BENCHMARK_CAPTURE( //
-    theoretic_tops_cuda, b1i32xor_sm75tc, tops_b1i32xor_sm75tc_8x8x128_1024unroll_cuda_kernel, //
-    8, 8, 128, 1024, 75)
+BENCHMARK_CAPTURE( //
+    theoretic_tops_cuda, b1i32xor_sm75tc, tops_b1i32xor_sm75tc_8x8x128_loop128_cuda_kernel, //
+    8, 8, 128, 128, 75)
     ->MinTime(10);
-BENCHMARK_CAPTURE( //
-    theoretic_tops_cuda, bf16f32_sm80tc, tops_bf16f32_sm80tc_16x16x16_1024unroll_cuda_kernel, //
-    16, 16, 16, 1024, 80)
+BENCHMARK_CAPTURE( //
+    theoretic_tops_cuda, bf16f32_sm80tc, tops_bf16f32_sm80tc_16x16x16_loop128_cuda_kernel, //
+    16, 16, 16, 128, 80)
     ->MinTime(10);
-BENCHMARK_CAPTURE( //
-    theoretic_tops_cuda, tf32f32_sm80tc, tops_tf32f32_sm80tc_16x16x8_1024unroll_cuda_kernel, //
-    16, 16, 8, 1024, 80)
+BENCHMARK_CAPTURE( //
+    theoretic_tops_cuda, tf32f32_sm80tc, tops_tf32f32_sm80tc_16x16x8_loop128_cuda_kernel, //
+    16, 16, 8, 128, 80)
     ->MinTime(10);
-BENCHMARK_CAPTURE( //
-    theoretic_tops_cuda, f64f64_sm80tc, tops_f64f64_sm80tc_8x8x4_1024unroll_cuda_kernel, //
-    8, 8, 4, 1024, 80)
+BENCHMARK_CAPTURE( //
+    theoretic_tops_cuda, f64f64_sm80tc, tops_f64f64_sm80tc_8x8x4_loop128_cuda_kernel, //
+    8, 8, 4, 128, 80)
     ->MinTime(10);
-BENCHMARK_CAPTURE( //
-    theoretic_tops_cuda, b1i32and_sm80tc, tops_b1i32and_sm80tc_8x8x128_1024unroll_cuda_kernel, //
-    8, 8, 128, 1024, 80)
+BENCHMARK_CAPTURE( //
+    theoretic_tops_cuda, b1i32and_sm80tc, tops_b1i32and_sm80tc_8x8x128_loop128_cuda_kernel, //
+    8, 8, 128, 128, 80)
     ->MinTime(10);
 
 #include <filesystem>
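
Side note, not part of the commit: the trailing capture arguments are the tile shape (m, n, k), the loop length, and the minimum SM version. Under the usual 2·m·n·k operations-per-MMA convention, a hedged back-of-the-envelope for the shrunken 128-iteration loop looks like the sketch below; the exact accounting inside theoretic_tops_cuda may differ, and the variable names here are illustrative only.

#include <cstdio>

int main() {
    // One m x n x k matrix-multiply-accumulate is conventionally counted as 2*m*n*k ops.
    long long m = 16, n = 16, k = 16, repetitions = 128;
    long long ops_per_warp = 2 * m * n * k * repetitions; // 1,048,576 ops per kernel body
    std::printf("ops per warp per kernel body: %lld\n", ops_per_warp);
    // theoretic_tops_cuda presumably scales this by the number of resident warps
    // before reporting the counter as a rate via benchmark::Counter::kIsRate.
    return 0;
}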
@@ -2202,7 +2202,10 @@ static void theoretic_tops_ptx( //
         return;
     }
 
-    // Load the PTX file
+    // Load the PTX file and JIT it!
+    // If the compilation is taking long, consider using the `CUDA_CACHE_PATH`
+    // environment variable to cache already compiled modules:
+    // https://developer.nvidia.com/blog/cuda-pro-tip-understand-fat-binaries-jit-caching/
     result = cuModuleLoad(&module_, ptx_file.c_str());
     if (result != CUDA_SUCCESS) {
         state.SkipWithError("Failed to load PTX file: " + last_error_string());
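
For context on the hunk above, and not part of this commit: a minimal sketch of the CUDA Driver API flow that theoretic_tops_ptx relies on, i.e. JIT-compiling a standalone PTX file into a module and fetching one kernel by its .entry name. The file and kernel names below are placeholders, not the benchmark's.

#include <cuda.h> // CUDA Driver API: cuInit, cuModuleLoad, cuLaunchKernel, ...
#include <cstdio>

int main() {
    // Create a context on the first visible device.
    if (cuInit(0) != CUDA_SUCCESS) return 1;
    CUdevice device;
    CUcontext context;
    cuDeviceGet(&device, 0);
    cuCtxCreate(&context, 0, device);

    // cuModuleLoad JIT-compiles the PTX for the current GPU; the resulting
    // binaries can be cached on disk via the CUDA_CACHE_PATH variable.
    CUmodule module_;
    if (cuModuleLoad(&module_, "example.ptx") != CUDA_SUCCESS) return 1;

    // Look the kernel up by the .entry name declared in the PTX.
    CUfunction kernel;
    cuModuleGetFunction(&kernel, module_, "example_loop128_ptx_kernel");

    // Launch a single warp with no arguments and wait for completion.
    cuLaunchKernel(kernel, 1, 1, 1, 32, 1, 1, 0, nullptr, nullptr, nullptr);
    cuCtxSynchronize();

    cuModuleUnload(module_);
    cuCtxDestroy(context);
    std::puts("done");
    return 0;
}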
@@ -2261,46 +2264,46 @@ static void theoretic_tops_ptx( //
     cuCtxDestroy(context);
 }
 
+BENCHMARK_CAPTURE( //
+    theoretic_tops_ptx, f16f16_sm70tc, //
+    "less_slow_sm70.ptx", "tops_f16f16_sm70tc_16x16x16_loop128_ptx_kernel", //
+    16, 16, 16, 128, 70)
+    ->MinTime(10);
+
 BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, f16f16_sm70tc, //
-    "less_slow_sm70.ptx", "tops_f16f16_sm70tc_16x16x16_1024loop_ptx_kernel", //
-    16, 16, 16, 1024, 70)
+    theoretic_tops_ptx, f16f16_sm90tc, //
+    "less_slow_sm90a.ptx", "tops_f16f16_sm90tc_16x16x16_loop128_ptx_kernel", //
+    16, 16, 16, 128, 90)
     ->MinTime(10);
 
-BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, f16f16_sm90tc, //
-    "less_slow_sm90a.ptx", "tops_f16f16_sm90tc_16x16x16_1024loop_ptx_kernel", //
-    16, 16, 16, 1024, 90)
+BENCHMARK_CAPTURE( //
+    theoretic_tops_ptx, f64f64_sm90tc, //
+    "less_slow_sm90a.ptx", "tops_f64f64_sm90tc_8x8x4_loop128_ptx_kernel", //
+    8, 8, 4, 128, 90)
     ->MinTime(10);
 
-BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, f64f64_sm90tc, //
-    "less_slow_sm90a.ptx", "tops_f64f64_sm90tc_8x8x4_1024loop_ptx_kernel", //
-    8, 8, 4, 1024, 90)
+BENCHMARK_CAPTURE( //
+    theoretic_tops_ptx, tf32f32_sm90tc, //
+    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_16x16x8_loop128_ptx_kernel", //
+    16, 16, 8, 128, 90)
     ->MinTime(10);
 
 BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, tf32f32_sm90tc, //
-    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_16x16x8_1024loop_ptx_kernel", //
-    16, 16, 8, 1024, 90)
+    theoretic_tops_ptx, tf32f32_sm90tc_wgmma_smallest, //
+    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n16k8_loop128_ptx_kernel", //
+    64, 16, 8, 128, 90)
     ->MinTime(10);
 
 BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, tf32f32_sm90tc_wgmma_smallest, //
-    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n16k8_1024loop_ptx_kernel", //
-    64, 16, 8, 1024, 90)
-    ->MinTime(10);
-
-BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, tf32f32_sm90tc_wgmma_largest, //
-    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n256k8_1024loop_ptx_kernel", //
-    64, 256, 8, 1024, 90)
+    theoretic_tops_ptx, tf32f32_sm90tc_wgmma_largest, //
+    "less_slow_sm90a.ptx", "tops_tf32f32_sm90tc_m64n256k8_loop128_ptx_kernel", //
+    64, 256, 8, 128, 90)
     ->MinTime(10);
 
-BENCHMARK_CAPTURE( //
-    theoretic_tops_ptx, b1i32and_sm90tc_wgmma, //
-    "less_slow_sm90a.ptx", "tops_b1i32and_sm90tc_m64n256k256_1024loop_ptx_kernel", //
-    64, 256, 256, 1024, 90)
+BENCHMARK_CAPTURE( //
+    theoretic_tops_ptx, b1i32and_sm90tc_wgmma, //
+    "less_slow_sm90a.ptx", "tops_b1i32and_sm90tc_m64n256k256_loop128_ptx_kernel", //
+    64, 256, 256, 128, 90)
     ->MinTime(10);
 
 /**

less_slow.cu

Lines changed: 20 additions & 20 deletions
@@ -180,7 +180,7 @@ void reverse_and_sort_with_cub(std::uint32_t *device_pointer, std::size_t array_
  * @brief A CUDA kernel that @b repeatedly computes the product of two small
  * matrices of size MxN and NxK using Tensor Cores.
  */
-template <typename input_type_, typename output_type_, int m_, int n_, int k_, int repetitions_>
+template <typename input_type_, typename output_type_, int m_, int n_, int k_, int repetitions_ = 128>
 __device__ inline void tops_tc_cuda_kernel() {
     using namespace nvcuda;
     wmma::fragment<wmma::matrix_a, m_, n_, k_, input_type_, wmma::row_major> a_frag;
@@ -210,7 +210,7 @@ __device__ inline void tops_tc_cuda_kernel() {
  *
  * @see Docs: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#sub-byte-operations
  */
-template <typename input_type_, typename output_type_, int m_, int n_, int k_, int repetitions_>
+template <typename input_type_, typename output_type_, int m_, int n_, int k_, int repetitions_ = 128>
 __device__ inline void binary_tops_tc_cuda_kernel( //
     nvcuda::wmma::experimental::bmmaBitOp bit_op, nvcuda::wmma::experimental::bmmaAccumulateOp acc_op) {
     using namespace nvcuda;
@@ -225,48 +225,48 @@ __device__ inline void binary_tops_tc_cuda_kernel( //
 
 #pragma region Volta
 
-__global__ void tops_f16f16_sm70tc_16x16x16_1024unroll_cuda_kernel() {
+__global__ void tops_f16f16_sm70tc_16x16x16_loop128_cuda_kernel() {
     //? On Volta: 8x8x4.
     //? On Turing: 8x8x4 / 16x8x8 / 16x8x16.
     //? On Ampere: 16x8x8 / 16x8x16.
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
-    tops_tc_cuda_kernel<half, half, 16, 16, 16, 1024>();
+    tops_tc_cuda_kernel<half, half, 16, 16, 16>();
 #endif
 }
-__global__ void tops_f16f32_sm70tc_16x16x16_1024unroll_cuda_kernel() {
+__global__ void tops_f16f32_sm70tc_16x16x16_loop128_cuda_kernel() {
     //? On Volta: 8x8x4.
     //? On Turing: 8x8x4 / 16x8x8 / 16x8x16.
     //? On Ampere: 16x8x8 / 16x8x16.
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
-    tops_tc_cuda_kernel<half, float, 16, 16, 16, 1024>();
+    tops_tc_cuda_kernel<half, float, 16, 16, 16>();
 #endif
 }
 
 #pragma endregion
 
 #pragma region Turing
 
-__global__ void tops_u8i32_sm75tc_16x16x16_1024unroll_cuda_kernel() {
+__global__ void tops_u8i32_sm75tc_16x16x16_loop128_cuda_kernel() {
     //? On Turing: 8x8x16.
     //? On Ampere: 8x8x16 / 16x8x16 / 16x8x32.
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
-    tops_tc_cuda_kernel<std::uint8_t, int32_t, 16, 16, 16, 1024>();
+    tops_tc_cuda_kernel<std::uint8_t, int32_t, 16, 16, 16>();
 #endif
 }
-__global__ void tops_u4i32_sm75tc_8x8x32_1024unroll_cuda_kernel() {
+__global__ void tops_u4i32_sm75tc_8x8x32_loop128_cuda_kernel() {
     //! The 16x16x16 won't compile, 8x8x32 will.
     //? On Turing: 8x8x32.
     //? On Ampere: 8x8x32 / 16x8x32 / 16x8x64.
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
-    tops_tc_cuda_kernel<nvcuda::wmma::experimental::precision::u4, int32_t, 8, 8, 32, 1024>();
+    tops_tc_cuda_kernel<nvcuda::wmma::experimental::precision::u4, int32_t, 8, 8, 32>();
 #endif
 }
-__global__ void tops_b1i32xor_sm75tc_8x8x128_1024unroll_cuda_kernel() {
+__global__ void tops_b1i32xor_sm75tc_8x8x128_loop128_cuda_kernel() {
     //! The 16x16x16 won't compile, 8x8x128 will.
     //? On Turing: 8x8x128.
     //? On Ampere: 8x8x128 / 16x8x128 / 16x8x256.
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750)
-    binary_tops_tc_cuda_kernel<nvcuda::wmma::experimental::precision::b1, int32_t, 8, 8, 128, 1024>(
+    binary_tops_tc_cuda_kernel<nvcuda::wmma::experimental::precision::b1, int32_t, 8, 8, 128>(
         nvcuda::wmma::experimental::bmmaBitOp::bmmaBitOpXOR,
         nvcuda::wmma::experimental::bmmaAccumulateOp::bmmaAccumulateOpPOPC);
 #endif
@@ -276,32 +276,32 @@ __global__ void tops_b1i32xor_sm75tc_8x8x128_1024unroll_cuda_kernel() {
 
 #pragma region Ampere
 
-__global__ void tops_bf16f32_sm80tc_16x16x16_1024unroll_cuda_kernel() {
+__global__ void tops_bf16f32_sm80tc_16x16x16_loop128_cuda_kernel() {
     //? On Ampere: 16x8x8 / 16x8x16.
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-    tops_tc_cuda_kernel<__nv_bfloat16, float, 16, 16, 16, 1024>();
+    tops_tc_cuda_kernel<__nv_bfloat16, float, 16, 16, 16>();
 #endif
 }
-__global__ void tops_tf32f32_sm80tc_16x16x8_1024unroll_cuda_kernel() {
+__global__ void tops_tf32f32_sm80tc_16x16x8_loop128_cuda_kernel() {
     //! The 16x16x16 won't compile, 16x16x8 will.
     //? On Ampere: 16x8x4.
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-    tops_tc_cuda_kernel<nvcuda::wmma::precision::tf32, float, 16, 16, 8, 1024>();
+    tops_tc_cuda_kernel<nvcuda::wmma::precision::tf32, float, 16, 16, 8>();
 #endif
 }
-__global__ void tops_f64f64_sm80tc_8x8x4_1024unroll_cuda_kernel() {
+__global__ void tops_f64f64_sm80tc_8x8x4_loop128_cuda_kernel() {
     //! The 16x16x16 won't compile, 8x8x4 will.
     //? On Ampere: 8x8x4.
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-    tops_tc_cuda_kernel<double, double, 8, 8, 4, 1024>();
+    tops_tc_cuda_kernel<double, double, 8, 8, 4>();
 #endif
 }
 
-__global__ void tops_b1i32and_sm80tc_8x8x128_1024unroll_cuda_kernel() {
+__global__ void tops_b1i32and_sm80tc_8x8x128_loop128_cuda_kernel() {
     //! The 16x16x16 won't compile, 8x8x128 will.
     //? On Ampere: 8x8x128 / 16x8x128 / 16x8x256.
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)
-    binary_tops_tc_cuda_kernel<nvcuda::wmma::experimental::precision::b1, int32_t, 8, 8, 128, 1024>(
+    binary_tops_tc_cuda_kernel<nvcuda::wmma::experimental::precision::b1, int32_t, 8, 8, 128>(
         nvcuda::wmma::experimental::bmmaBitOp::bmmaBitOpAND,
         nvcuda::wmma::experimental::bmmaAccumulateOp::bmmaAccumulateOpPOPC);
 #endif
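
Not part of the diff, but for readers unfamiliar with the WMMA API used by tops_tc_cuda_kernel: a minimal, self-contained sketch of such a repeat-the-MMA loop, now bounded by the 128-iteration default introduced above. Names, sizes, and fragment layouts here are illustrative assumptions, not the repository's exact implementation.

#include <mma.h>          // nvcuda::wmma fragments and mma_sync
#include <cuda_fp16.h>    // half, __float2half
#include <cuda_runtime.h>

template <int m_, int n_, int k_, int repetitions_ = 128>
__device__ inline void tops_f16f16_loop_sketch() {
    using namespace nvcuda;
    // Register-resident tiles: two inputs and one accumulator, all in fp16.
    wmma::fragment<wmma::matrix_a, m_, n_, k_, half, wmma::row_major> a_frag;
    wmma::fragment<wmma::matrix_b, m_, n_, k_, half, wmma::col_major> b_frag;
    wmma::fragment<wmma::accumulator, m_, n_, k_, half> c_frag;
    wmma::fill_fragment(a_frag, __float2half(1.0f));
    wmma::fill_fragment(b_frag, __float2half(1.0f));
    wmma::fill_fragment(c_frag, __float2half(0.0f));
    // Back-to-back Tensor Core MMAs with no memory traffic in the loop body.
    for (int i = 0; i != repetitions_; ++i)
        wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
}

__global__ void tops_f16f16_loop_sketch_kernel() {
#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 700)
    tops_f16f16_loop_sketch<16, 16, 16>(); // 128 iterations by default
#endif
}

int main() {
    tops_f16f16_loop_sketch_kernel<<<1, 32>>>(); // one warp is enough for WMMA
    return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}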

less_slow_sm70.ptx

Lines changed: 3 additions & 3 deletions
@@ -37,7 +37,7 @@
 .target sm_70 // Target architecture (SM 7.0 - Volta GPUs)
 .address_size 64 // 64-bit addressing
 
-.visible .entry tops_f16f16_sm70tc_16x16x16_1024loop_ptx_kernel()
+.visible .entry tops_f16f16_sm70tc_16x16x16_loop128_ptx_kernel()
 {
     // Accumulator registers used for both input and output of the MMA operation
     .reg .b32 accum_0, accum_1, accum_2, accum_3;
@@ -58,7 +58,7 @@
 
     // Set up loop counter and loop limit
     mov.u32 loop_counter, 0;
-    mov.u32 loop_limit, 1024;
+    mov.u32 loop_limit, 128;
 
     // Zero-initialize the accumulator registers
     mov.f32 accum_0, 0.0;
@@ -89,7 +89,7 @@
     mov.b32 matrix_b_6, packed_const;
     mov.b32 matrix_b_7, packed_const;
 
-    // The main loop will repeat for 1024 iterations
+    // The main loop will repeat for 128 iterations
 loop_start:
     setp.ge.u32 exit_predicate, loop_counter, loop_limit;
     @exit_predicate bra loop_end;
