
Commit 80e1d83

Improve: Naming variables
1 parent 306ee3f · commit 80e1d83

File tree

1 file changed: +16 -16 lines changed

less_slow.cu

Lines changed: 16 additions & 16 deletions
@@ -273,21 +273,21 @@ __global__ void tops_i64i64_sm60fma_16x16x16_loop128_cuda_kernel() {
 template <typename input_type_, typename output_type_, int m_, int n_, int k_, int repetitions_ = 128>
 __device__ inline void tops_tc_cuda_kernel() {
     using namespace nvcuda;
-    wmma::fragment<wmma::matrix_a, m_, n_, k_, input_type_, wmma::row_major> a_frag;
-    wmma::fragment<wmma::matrix_b, m_, n_, k_, input_type_, wmma::col_major> b_frag;
-    wmma::fragment<wmma::accumulator, m_, n_, k_, output_type_> c_frag;
+    wmma::fragment<wmma::matrix_a, m_, n_, k_, input_type_, wmma::row_major> a_tile;
+    wmma::fragment<wmma::matrix_b, m_, n_, k_, input_type_, wmma::col_major> b_tile;
+    wmma::fragment<wmma::accumulator, m_, n_, k_, output_type_> c_tile;
 
     // To initialize, we can call:
     //
-    //      wmma::fill_fragment(a_frag, 1);
-    //      wmma::fill_fragment(b_frag, 1);
-    //      wmma::fill_fragment(c_frag, 0);
+    //      wmma::fill_fragment(a_tile, 1);
+    //      wmma::fill_fragment(b_tile, 1);
+    //      wmma::fill_fragment(c_tile, 0);
     //
     // To better saturate the ALU, we could unroll a few iterations:
-    for (int i = 0; i != repetitions_; ++i) wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);
+    for (int r = 0; r != repetitions_; ++r) wmma::mma_sync(c_tile, a_tile, b_tile, c_tile);
 
     // Impossible condition to prevent optimization
-    if (threadIdx.x == 2147483647) wmma::store_matrix_sync(nullptr, c_frag, 16, wmma::mem_row_major);
+    if (threadIdx.x == 2147483647) wmma::store_matrix_sync(nullptr, c_tile, 16, wmma::mem_row_major);
 }
 
 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) //? Binary matrices require SM75 or higher
@@ -304,11 +304,11 @@ template <typename input_type_, typename output_type_, int m_, int n_, int k_, i
 __device__ inline void binary_tops_tc_cuda_kernel( //
     nvcuda::wmma::experimental::bmmaBitOp bit_op, nvcuda::wmma::experimental::bmmaAccumulateOp acc_op) {
     using namespace nvcuda;
-    wmma::fragment<wmma::matrix_a, m_, n_, k_, input_type_, wmma::row_major> a_frag;
-    wmma::fragment<wmma::matrix_b, m_, n_, k_, input_type_, wmma::col_major> b_frag;
-    wmma::fragment<wmma::accumulator, m_, n_, k_, output_type_> c_frag;
-    for (int i = 0; i != repetitions_; ++i) wmma::bmma_sync(c_frag, a_frag, b_frag, c_frag, bit_op, acc_op);
-    if (threadIdx.x == 2147483647) wmma::store_matrix_sync(nullptr, c_frag, 16, wmma::mem_row_major);
+    wmma::fragment<wmma::matrix_a, m_, n_, k_, input_type_, wmma::row_major> a_tile;
+    wmma::fragment<wmma::matrix_b, m_, n_, k_, input_type_, wmma::col_major> b_tile;
+    wmma::fragment<wmma::accumulator, m_, n_, k_, output_type_> c_tile;
+    for (int r = 0; r != repetitions_; ++r) wmma::bmma_sync(c_tile, a_tile, b_tile, c_tile, bit_op, acc_op);
+    if (threadIdx.x == 2147483647) wmma::store_matrix_sync(nullptr, c_tile, 16, wmma::mem_row_major);
 }
 
 #endif
@@ -692,7 +692,7 @@ __global__ void tops_f16f32_sm90wgmma_64x256x16_loop128_cuda_kernel() {
     std::uint64_t a_descriptor = wgmma_descriptor((std::uint64_t)a_shared, 128, 256, 0, 0);
     std::uint64_t b_descriptor = wgmma_descriptor((std::uint64_t)b_shared, 128 * 256 / 8, 128, 0, 0);
     wgmma_fence();
-    for (int i = 0; i != 128; ++i) {
+    for (int r = 0; r != 128; ++r) {
         wgmma_f16f32_64x256x16(c_registers, a_descriptor, b_descriptor);
         wgmma_commit_group();
     }
@@ -710,7 +710,7 @@ __global__ void tops_bf16f32_sm90wgmma_64x256x16_loop128_cuda_kernel() {
     std::uint64_t a_descriptor = wgmma_descriptor((std::uint64_t)a_shared, 128, 256, 0, 0);
     std::uint64_t b_descriptor = wgmma_descriptor((std::uint64_t)b_shared, 128 * 256 / 8, 128, 0, 0);
     wgmma_fence();
-    for (int i = 0; i != 128; ++i) {
+    for (int r = 0; r != 128; ++r) {
         wgmma_bf16f32_64x256x16(c_registers, a_descriptor, b_descriptor);
         wgmma_commit_group();
     }
@@ -730,7 +730,7 @@ __global__ void tops_tf32f32_sm90wgmma_64x256x8_loop128_cuda_kernel() {
     std::uint64_t a_descriptor = wgmma_descriptor((std::uint64_t)a_shared, 128, 256, 0, 0);
     std::uint64_t b_descriptor = wgmma_descriptor((std::uint64_t)b_shared, 128 * 256 / 8, 128, 0, 0);
     wgmma_fence();
-    for (int i = 0; i != 128; ++i) {
+    for (int r = 0; r != 128; ++r) {
         wgmma_tf32f32_64x256x8(c_registers, a_descriptor, b_descriptor);
         wgmma_commit_group();
     }
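
For context, below is a minimal, self-contained sketch of how a WMMA helper like the one touched in the first hunk is typically instantiated and launched, using the new *_tile names. The example kernel names, the f16-input/f32-accumulator 16x16x16 shape, and the single-warp launch configuration are illustrative assumptions for this sketch, not part of this commit.

// Illustrative sketch only; mirrors the pattern from the first hunk.
// The example_* names and launch shape are assumptions.
// Compile with something like: nvcc -arch=sm_70 example.cu
#include <mma.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

using namespace nvcuda;

template <typename input_type_, typename output_type_, int m_, int n_, int k_, int repetitions_ = 128>
__device__ inline void example_tops_tc_cuda_kernel() {
    // Per-warp register tiles; deliberately left uninitialized, as only ALU throughput is measured
    wmma::fragment<wmma::matrix_a, m_, n_, k_, input_type_, wmma::row_major> a_tile;
    wmma::fragment<wmma::matrix_b, m_, n_, k_, input_type_, wmma::col_major> b_tile;
    wmma::fragment<wmma::accumulator, m_, n_, k_, output_type_> c_tile;
    for (int r = 0; r != repetitions_; ++r) wmma::mma_sync(c_tile, a_tile, b_tile, c_tile);
    // Impossible condition to keep the compiler from optimizing the loop away
    if (threadIdx.x == 2147483647) wmma::store_matrix_sync(nullptr, c_tile, 16, wmma::mem_row_major);
}

__global__ void example_tops_f16f32_16x16x16_loop128_kernel() {
    example_tops_tc_cuda_kernel<half, float, 16, 16, 16>();
}

int main() {
    // One warp suffices: WMMA fragments are owned collectively by the 32 threads of a warp
    example_tops_f16f32_16x16x16_loop128_kernel<<<1, 32>>>();
    return cudaDeviceSynchronize() == cudaSuccess ? 0 : 1;
}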

0 commit comments
