
Commit 9296d1f

Apply auto-formatting by clang
1 parent 8fc2c03 commit 9296d1f

4 files changed, +32 -23 lines changed

ggml/src/ggml-cuda/mean.cu

Lines changed: 3 additions & 3 deletions

@@ -15,11 +15,11 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t nrows = ggml_nrows(src0);
 
     const dim3 block_nums(nrows, 1, 1);
-    if ((nrows / ctx.sm_count)< 2){
+    if ((nrows / ctx.sm_count) < 2) {
         constexpr dim3 block_dims(512, 1, 1);
-        reduce_rows_f32</*norm=*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     } else {
         const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
-        reduce_rows_f32</*norm=*/ true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+        reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
     }
 }

ggml/src/ggml-cuda/reduce_rows.cuh

Lines changed: 13 additions & 14 deletions

@@ -1,40 +1,39 @@
 #include "common.cuh"
 
 // Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
-template<bool norm>
+template <bool norm>
 static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
     const int row = blockIdx.x;
     const int col = threadIdx.x;
 
-    float sum = 0.0f;
+    float sum = 0.0f;
     const int num_unroll = 8;
-    float temp[num_unroll];
-    float sum_temp[num_unroll] = {0.0f};
+    float temp[num_unroll];
+    float sum_temp[num_unroll] = { 0.0f };
     for (int i = col; i < ncols;) {
-        for (int j = 0; j < num_unroll; ++j){
-            if (i < ncols){
+        for (int j = 0; j < num_unroll; ++j) {
+            if (i < ncols) {
                 temp[j] = x[row * ncols + i];
-            }
-            else {
+            } else {
                 temp[j] = 0;
             }
             i += blockDim.x;
         }
-        for (int j = 0; j < num_unroll; ++j){
+        for (int j = 0; j < num_unroll; ++j) {
             sum_temp[j] += temp[j];
         }
     }
-    for (int j = 0; j < num_unroll; ++j){
-        sum += sum_temp[j];
+    for (int j = 0; j < num_unroll; ++j) {
+        sum += sum_temp[j];
     }
 
     // sum up partial sums
     sum = warp_reduce_sum(sum);
     if (blockDim.x > WARP_SIZE) {
         assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
         __shared__ float s_sum[32];
-        const int warp_id = threadIdx.x / WARP_SIZE;
-        const int lane_id = threadIdx.x % WARP_SIZE;
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
         if (lane_id == 0) {
             s_sum[warp_id] = sum;
         }
@@ -51,4 +50,4 @@ static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
     }
 
     dst[row] = norm ? sum / ncols : sum;
-}
+}
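
For reference, below is a minimal standalone sketch, not ggml code, of the same row-reduction pattern: one block per row, strided loads, a warp reduction, then a shared-memory reduction across warps. It assumes WARP_SIZE is 32 and supplies a local stand-in for common.cuh's warp_reduce_sum; the unrolled load loop from the real kernel is omitted and all names are illustrative.

// Standalone sketch, not part of ggml: local stand-ins for common.cuh helpers.
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>

#define WARP_SIZE 32

// Butterfly reduction across one warp (same idea as common.cuh's warp_reduce_sum).
static __device__ float warp_reduce_sum_sketch(float x) {
    for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, offset, WARP_SIZE);
    }
    return x;
}

// One block per row; compute the row sum (norm=false) or mean (norm=true).
template <bool norm>
static __global__ void reduce_rows_sketch(const float * x, float * dst, const int ncols) {
    const int row = blockIdx.x;

    float sum = 0.0f;
    for (int i = threadIdx.x; i < ncols; i += blockDim.x) {
        sum += x[row * ncols + i];
    }

    // Reduce within each warp, then across warps via shared memory.
    sum = warp_reduce_sum_sketch(sum);
    if (blockDim.x > WARP_SIZE) {
        __shared__ float s_sum[32];
        const int warp_id = threadIdx.x / WARP_SIZE;
        const int lane_id = threadIdx.x % WARP_SIZE;
        if (lane_id == 0) {
            s_sum[warp_id] = sum;
        }
        __syncthreads();
        sum = lane_id < blockDim.x / WARP_SIZE ? s_sum[lane_id] : 0.0f;
        sum = warp_reduce_sum_sketch(sum);
    }

    if (threadIdx.x == 0) {
        dst[row] = norm ? sum / ncols : sum;
    }
}

int main() {
    const int nrows = 4, ncols = 1000;
    std::vector<float> h_x(nrows * ncols, 1.0f);
    float * d_x   = nullptr;
    float * d_dst = nullptr;
    cudaMalloc(&d_x, h_x.size() * sizeof(float));
    cudaMalloc(&d_dst, nrows * sizeof(float));
    cudaMemcpy(d_x, h_x.data(), h_x.size() * sizeof(float), cudaMemcpyHostToDevice);

    // Few rows, so use 512 threads per block as in the launch heuristic above.
    reduce_rows_sketch</*norm=*/true><<<dim3(nrows, 1, 1), dim3(512, 1, 1)>>>(d_x, d_dst, ncols);

    std::vector<float> h_dst(nrows);
    cudaMemcpy(h_dst.data(), d_dst, nrows * sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("mean of row 0: %f (expected 1.0)\n", h_dst[0]);

    cudaFree(d_x);
    cudaFree(d_dst);
    return 0;
}

Built with plain nvcc, this prints a row mean of 1.0 for an all-ones input.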

ggml/src/ggml-cuda/sum.cuh

Lines changed: 6 additions & 1 deletion

@@ -1,5 +1,10 @@
 #include "common.cuh"
 
-void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream, int & n_sum);
+void sum_f32_cuda(ggml_cuda_pool & pool,
+                  const float * x,
+                  float * dst,
+                  const int64_t ne,
+                  cudaStream_t stream,
+                  int & n_sm);
 
 void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

ggml/src/ggml-cuda/sumrows.cu

Lines changed: 10 additions & 5 deletions

@@ -1,9 +1,14 @@
-#include "sumrows.cuh"
 #include "reduce_rows.cuh"
+#include "sumrows.cuh"
 
-void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream, int & n_sm) {
+void sum_rows_f32_cuda(const float * x,
+                       float * dst,
+                       const int ncols,
+                       const int nrows,
+                       cudaStream_t stream,
+                       int & n_sm) {
     const dim3 block_nums(nrows, 1, 1);
-    if ((nrows / n_sm)< 2){
+    if ((nrows / n_sm) < 2) {
         const dim3 block_dims(512, 1, 1);
         reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
     } else {
@@ -26,8 +31,8 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t nrows = ggml_nrows(src0);
 
     const dim3 block_nums(nrows, 1, 1);
-
-    if ((nrows / ctx.sm_count)< 2){
+
+    if ((nrows / ctx.sm_count) < 2) {
         // Increase num threads to 512 for small nrows to better hide the latency
         const dim3 block_dims(512, 1, 1);
         reduce_rows_f32</*norm=*/false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
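
The comment in the last hunk states the launch heuristic shared by mean.cu and sumrows.cu: with fewer than two rows per SM, a 512-thread block hides latency better; otherwise the block width follows the row length. A hedged sketch of that choice as a helper (the real code inlines the logic; the helper name here is illustrative only):

#include <cstdint>
#include <cuda_runtime.h>

// Illustrative helper mirroring the block-size choice in mean.cu / sumrows.cu.
static dim3 pick_reduce_rows_block_dims(int64_t nrows, int64_t ncols, int n_sm) {
    if ((nrows / n_sm) < 2) {
        // Few rows per SM: use 512 threads per block to better hide latency.
        return dim3(512, 1, 1);
    }
    // Enough rows to occupy the SMs: narrower blocks, wider for long rows.
    return dim3(ncols < 1024 ? 32 : 128, 1, 1);
}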
