Skip to content

Commit 22f52c4

Browse files
committed
Fix: Initialize FMA inputs
Otherwise, PTXAS optimizes out the integer kernels
1 parent 1ab4f41 commit 22f52c4

File tree

1 file changed

+4
-0
lines changed

1 file changed

+4
-0
lines changed

less_slow.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,10 @@ __device__ void tops_fma_cuda_kernel() {
147147
input_type_ a_tile[matrix_side_][matrix_side_], b_tile[matrix_side_][matrix_side_];
148148
output_type_ c_tile[matrix_side_][matrix_side_];
149149

150+
// Initialize the input tiles with index-derived values and the accumulator with zeros
151+
for (int i = 0; i < matrix_side_; ++i)
152+
for (int j = 0; j < matrix_side_; ++j) a_tile[i][j] = b_tile[i][j] = i * matrix_side_ + j, c_tile[i][j] = 0;
153+
150154
// Repeatedly perform FMA-like operations
151155
fma_operator_ fma_operator;
152156
for (int r = 0; r < repetitions_; ++r) {

0 commit comments

Comments
 (0)