We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 1ab4f41 commit 22f52c4 (Copy full SHA for 22f52c4)
less_slow.cu
@@ -147,6 +147,10 @@ __device__ void tops_fma_cuda_kernel() {
147
input_type_ a_tile[matrix_side_][matrix_side_], b_tile[matrix_side_][matrix_side_];
148
output_type_ c_tile[matrix_side_][matrix_side_];
149
150
+ // Fill both input tiles with each element's flat index and zero the accumulator
151
+ for (int i = 0; i < matrix_side_; ++i)
152
+ for (int j = 0; j < matrix_side_; ++j) a_tile[i][j] = b_tile[i][j] = i * matrix_side_ + j, c_tile[i][j] = 0;
153
+
154
// Repeatedly perform FMA-like operations
155
fma_operator_ fma_operator;
156
for (int r = 0; r < repetitions_; ++r) {
0 commit comments