@@ -5,10 +5,10 @@ void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int
     const dim3 block_nums(nrows, 1, 1);
     if ((nrows / n_sm) < 2) {
         const dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/false, 128><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
     } else {
         const dim3 block_dims(128, 1, 1);
-        reduce_rows_f32</*norm=*/false, 128><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
     }
 }

@@ -30,10 +30,10 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     if ((nrows / ctx.sm_count) < 2) {
         // Increase num threads to 512 for small nrows to better hide the latency
         const dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/false, 128><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     } else {
         // Enough active SMs to hide latency, use smaller blocks to allow better scheduling
        const dim3 block_dims(128, 1, 1);
-        reduce_rows_f32</*norm=*/false, 128><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     }
 }
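
Both launchers drop the compile-time block-size template argument, so the block size (512 or 128 threads) becomes a pure runtime choice. For that to compile, reduce_rows_f32 must take only the norm template parameter and read its block size from blockDim.x. Below is a minimal sketch of such a kernel, assuming a warp-shuffle reduction with static shared memory; the reduction strategy and local names are illustrative, not necessarily the actual ggml implementation.

// Hypothetical sketch: reduce_rows_f32 with the block size read at runtime.
// Works for any warp-multiple block size up to 512 threads, matching the
// 512- and 128-thread launches above. Uses only static shared memory, which
// is consistent with the 0 dynamic shared memory in the launch configs.
template <bool norm>
static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.x; // grid is (nrows, 1, 1), one block per row

    // Each thread accumulates a strided slice of its row.
    float sum = 0.0f;
    for (int col = threadIdx.x; col < ncols; col += blockDim.x) {
        sum += x[row * ncols + col];
    }

    // Intra-warp reduction via shuffles.
    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
        sum += __shfl_down_sync(0xFFFFFFFF, sum, offset);
    }

    // One partial sum per warp; 16 slots covers a 512-thread block.
    __shared__ float warp_sums[16];
    const int warp_id = threadIdx.x / warpSize;
    const int lane_id = threadIdx.x % warpSize;
    if (lane_id == 0) {
        warp_sums[warp_id] = sum;
    }
    __syncthreads();

    // The first warp reduces the per-warp partials and writes the result.
    if (warp_id == 0) {
        const int n_warps = (blockDim.x + warpSize - 1) / warpSize;
        sum = lane_id < n_warps ? warp_sums[lane_id] : 0.0f;
        for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
            sum += __shfl_down_sync(0xFFFFFFFF, sum, offset);
        }
        if (lane_id == 0) {
            dst[row] = norm ? sum / ncols : sum;
        }
    }
}

With the kernel shape-agnostic, the branch at the call sites is purely a scheduling decision, as the comments note: with fewer than two rows per SM there are too few blocks to keep the GPU busy, so wider 512-thread blocks hide memory latency within each row, while with many rows the narrower 128-thread blocks give the scheduler more independent blocks to interleave.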