@@ -273,21 +273,21 @@ __global__ void tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel() {
273273template <typename input_type_, typename output_type_, int m_, int n_, int k_, int repetitions_ = 128 >
274274__device__ inline void tops_tc_cuda_kernel () {
275275 using namespace nvcuda ;
276- wmma::fragment<wmma::matrix_a, m_, n_, k_, input_type_, wmma::row_major> a_frag ;
277- wmma::fragment<wmma::matrix_b, m_, n_, k_, input_type_, wmma::col_major> b_frag ;
278- wmma::fragment<wmma::accumulator, m_, n_, k_, output_type_> c_frag ;
276+ wmma::fragment<wmma::matrix_a, m_, n_, k_, input_type_, wmma::row_major> a_tile ;
277+ wmma::fragment<wmma::matrix_b, m_, n_, k_, input_type_, wmma::col_major> b_tile ;
278+ wmma::fragment<wmma::accumulator, m_, n_, k_, output_type_> c_tile ;
279279
280280 // To initialize, we can call:
281281 //
282- // wmma::fill_fragment(a_frag , 1);
283- // wmma::fill_fragment(b_frag , 1);
284- // wmma::fill_fragment(c_frag , 0);
282+ // wmma::fill_fragment(a_tile , 1);
283+ // wmma::fill_fragment(b_tile , 1);
284+ // wmma::fill_fragment(c_tile , 0);
285285 //
286286 // To better saturate the ALU, we could unroll a few iterations:
287- for (int i = 0 ; i != repetitions_; ++i ) wmma::mma_sync (c_frag, a_frag, b_frag, c_frag );
287+ for (int r = 0 ; r != repetitions_; ++r ) wmma::mma_sync (c_tile, a_tile, b_tile, c_tile );
288288
289289 // Impossible condition to prevent optimization
290- if (threadIdx .x == 2147483647 ) wmma::store_matrix_sync (nullptr , c_frag , 16 , wmma::mem_row_major);
290+ if (threadIdx .x == 2147483647 ) wmma::store_matrix_sync (nullptr , c_tile , 16 , wmma::mem_row_major);
291291}
292292
293293#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) // ? Binary matrices require SM75 or higher
@@ -304,11 +304,11 @@ template <typename input_type_, typename output_type_, int m_, int n_, int k_, i
304304__device__ inline void binary_tops_tc_cuda_kernel ( //
305305 nvcuda::wmma::experimental::bmmaBitOp bit_op, nvcuda::wmma::experimental::bmmaAccumulateOp acc_op) {
306306 using namespace nvcuda ;
307- wmma::fragment<wmma::matrix_a, m_, n_, k_, input_type_, wmma::row_major> a_frag ;
308- wmma::fragment<wmma::matrix_b, m_, n_, k_, input_type_, wmma::col_major> b_frag ;
309- wmma::fragment<wmma::accumulator, m_, n_, k_, output_type_> c_frag ;
310- for (int i = 0 ; i != repetitions_; ++i ) wmma::bmma_sync (c_frag, a_frag, b_frag, c_frag , bit_op, acc_op);
311- if (threadIdx .x == 2147483647 ) wmma::store_matrix_sync (nullptr , c_frag , 16 , wmma::mem_row_major);
307+ wmma::fragment<wmma::matrix_a, m_, n_, k_, input_type_, wmma::row_major> a_tile ;
308+ wmma::fragment<wmma::matrix_b, m_, n_, k_, input_type_, wmma::col_major> b_tile ;
309+ wmma::fragment<wmma::accumulator, m_, n_, k_, output_type_> c_tile ;
310+ for (int r = 0 ; r != repetitions_; ++r ) wmma::bmma_sync (c_tile, a_tile, b_tile, c_tile , bit_op, acc_op);
311+ if (threadIdx .x == 2147483647 ) wmma::store_matrix_sync (nullptr , c_tile , 16 , wmma::mem_row_major);
312312}
313313
314314#endif
@@ -692,7 +692,7 @@ __global__ void tops_f16f32_sm90wgmma_64x256x16_loop128_cuda_kernel() {
692692 std::uint64_t a_descriptor = wgmma_descriptor ((std::uint64_t )a_shared, 128 , 256 , 0 , 0 );
693693 std::uint64_t b_descriptor = wgmma_descriptor ((std::uint64_t )b_shared, 128 * 256 / 8 , 128 , 0 , 0 );
694694 wgmma_fence ();
695- for (int i = 0 ; i != 128 ; ++i ) {
695+ for (int r = 0 ; r != 128 ; ++r ) {
696696 wgmma_f16f32_64x256x16 (c_registers, a_descriptor, b_descriptor);
697697 wgmma_commit_group ();
698698 }
@@ -710,7 +710,7 @@ __global__ void tops_bf16f32_sm90wgmma_64x256x16_loop128_cuda_kernel() {
710710 std::uint64_t a_descriptor = wgmma_descriptor ((std::uint64_t )a_shared, 128 , 256 , 0 , 0 );
711711 std::uint64_t b_descriptor = wgmma_descriptor ((std::uint64_t )b_shared, 128 * 256 / 8 , 128 , 0 , 0 );
712712 wgmma_fence ();
713- for (int i = 0 ; i != 128 ; ++i ) {
713+ for (int r = 0 ; r != 128 ; ++r ) {
714714 wgmma_bf16f32_64x256x16 (c_registers, a_descriptor, b_descriptor);
715715 wgmma_commit_group ();
716716 }
@@ -730,7 +730,7 @@ __global__ void tops_tf32f32_sm90wgmma_64x256x8_loop128_cuda_kernel() {
730730 std::uint64_t a_descriptor = wgmma_descriptor ((std::uint64_t )a_shared, 128 , 256 , 0 , 0 );
731731 std::uint64_t b_descriptor = wgmma_descriptor ((std::uint64_t )b_shared, 128 * 256 / 8 , 128 , 0 , 0 );
732732 wgmma_fence ();
733- for (int i = 0 ; i != 128 ; ++i ) {
733+ for (int r = 0 ; r != 128 ; ++r ) {
734734 wgmma_tf32f32_64x256x8 (c_registers, a_descriptor, b_descriptor);
735735 wgmma_commit_group ();
736736 }
0 commit comments