
Commit f264793

Addressed code review comments: GGML_WARP_SYNC_MASK renamed to GGML_CUDA_WARP_MASK
1 parent 493f96a commit f264793

7 files changed: +30 additions, −30 deletions

ggml/src/ggml-cuda/argmax.cu

Lines changed: 4 additions & 4 deletions
@@ -22,8 +22,8 @@ static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __rest

 #pragma unroll
     for (int offset = 16; offset > 0; offset >>= 1) {
-        const float val = __shfl_xor_sync(GGML_WARP_SYNC_MASK, maxval, offset, WARP_SIZE);
-        const int   col = __shfl_xor_sync(GGML_WARP_SYNC_MASK, argmax, offset, WARP_SIZE);
+        const float val = __shfl_xor_sync(GGML_CUDA_WARP_MASK, maxval, offset, WARP_SIZE);
+        const int   col = __shfl_xor_sync(GGML_CUDA_WARP_MASK, argmax, offset, WARP_SIZE);
         if (val > maxval) {
             maxval = val;
             argmax = col;
@@ -51,8 +51,8 @@ static __global__ void argmax_f32(const float * __restrict__ x, int32_t * __rest
     }
 #pragma unroll
     for (int offset = 16; offset > 0; offset >>= 1) {
-        const float val = __shfl_xor_sync(GGML_WARP_SYNC_MASK, maxval, offset, WARP_SIZE);
-        const int   col = __shfl_xor_sync(GGML_WARP_SYNC_MASK, argmax, offset, WARP_SIZE);
+        const float val = __shfl_xor_sync(GGML_CUDA_WARP_MASK, maxval, offset, WARP_SIZE);
+        const int   col = __shfl_xor_sync(GGML_CUDA_WARP_MASK, argmax, offset, WARP_SIZE);
         if (val > maxval) {
             maxval = val;
             argmax = col;
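Note on the pattern above: the reduction shuffles both the running maximum and its column index, so after log2(32) = 5 butterfly steps every lane has compared against every other lane's candidate. A minimal standalone sketch of the same idea, separate from the kernel touched by this commit (hypothetical kernel name; assumes one full 32-lane warp and the literal 0xffffffff mask that GGML_CUDA_WARP_MASK expands to on CUDA):

// warp_argmax_sketch.cu -- illustrative sketch only, not part of this commit
__global__ void warp_argmax_sketch(const float * x, int * out_idx) {
    const int lane = threadIdx.x;   // launched as <<<1, 32>>>: exactly one warp
    float maxval = x[lane];
    int   argmax = lane;
#pragma unroll
    for (int offset = 16; offset > 0; offset >>= 1) {
        // each lane trades its current best with the lane whose id differs by `offset`
        const float val = __shfl_xor_sync(0xffffffff, maxval, offset, 32);
        const int   col = __shfl_xor_sync(0xffffffff, argmax, offset, 32);
        if (val > maxval) {
            maxval = val;
            argmax = col;
        }
    }
    if (lane == 0) {
        *out_idx = argmax;   // index of the warp-wide maximum (ties may resolve to either index)
    }
}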

ggml/src/ggml-cuda/common.cuh

Lines changed: 13 additions & 13 deletions
@@ -35,9 +35,9 @@
 #include "vendors/cuda.h"
 #endif // defined(GGML_USE_HIP)

-#ifndef GGML_WARP_SYNC_MASK
-#define GGML_WARP_SYNC_MASK 0xffffffff
-#endif
+#ifndef GGML_CUDA_WARP_MASK
+#define GGML_CUDA_WARP_MASK 0xffffffff
+#endif // GGML_CUDA_WARP_MASK

 #define STRINGIZE_IMPL(...) #__VA_ARGS__
 #define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
@@ -380,11 +380,11 @@ struct ggml_cuda_unroll<1> {
 template<int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_sum(int x) {
 #if (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE) || (defined(GGML_USE_HIP) && HIP_VERSION >= 70000000)
-    return __reduce_add_sync(GGML_WARP_SYNC_MASK, x);
+    return __reduce_add_sync(GGML_CUDA_WARP_MASK, x);
 #else
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width);
+        x += __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width);
     }
     return x;
 #endif // (!defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE) || (defined(GGML_USE_HIP) && HIP_VERSION >= 70000000)
@@ -394,7 +394,7 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_sum(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x += __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width);
+        x += __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width);
     }
     return x;
 }
@@ -403,8 +403,8 @@ template<int width = WARP_SIZE>
 static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a.x += __shfl_xor_sync(GGML_WARP_SYNC_MASK, a.x, offset, width);
-        a.y += __shfl_xor_sync(GGML_WARP_SYNC_MASK, a.y, offset, width);
+        a.x += __shfl_xor_sync(GGML_CUDA_WARP_MASK, a.x, offset, width);
+        a.y += __shfl_xor_sync(GGML_CUDA_WARP_MASK, a.y, offset, width);
     }
     return a;
 }
@@ -414,7 +414,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #ifdef FP16_AVAILABLE
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(GGML_WARP_SYNC_MASK, a, offset, width));
+        a = __hadd2(a, __shfl_xor_sync(GGML_CUDA_WARP_MASK, a, offset, width));
     }
     return a;

@@ -449,20 +449,20 @@ static __device__ __forceinline__ int warp_reduce_all(int x) {
 #ifdef GGML_USE_HIP
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = x && __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width);
+        x = x && __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width);
     }
     return x;
 #else
     static_assert(width == WARP_SIZE, "width != WARP_SIZE not implemented");
-    return __all_sync(GGML_WARP_SYNC_MASK, x);
+    return __all_sync(GGML_CUDA_WARP_MASK, x);
 #endif // GGML_USE_HIP
 }

 template<int width = WARP_SIZE>
 static __device__ __forceinline__ float warp_reduce_max(float x) {
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width));
+        x = fmaxf(x, __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width));
     }
     return x;
 }
@@ -505,7 +505,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
 #if !defined(GGML_USE_HIP) && __CUDA_ARCH__ >= GGML_CUDA_CC_PASCAL || (defined(GGML_USE_HIP) && HIP_VERSION >= 50700000)
 #pragma unroll
     for (int offset = width/2; offset > 0; offset >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(GGML_WARP_SYNC_MASK, x, offset, width));
+        x = ggml_cuda_hmax2(x, __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width));
     }
     return x;
 #else
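All of the warp_reduce_* helpers touched in this file share one shape: log2(width) XOR shuffles, each carrying the full-warp mask, after which every lane holds the reduced value. A minimal sketch of how such a helper is built and driven (hypothetical names; assumes a 32-lane warp and the 32-bit mask value used on the CUDA path):

// warp_sum_sketch.cu -- illustrative sketch only, not from the repository
#define SKETCH_WARP_MASK 0xffffffffu   // stand-in for GGML_CUDA_WARP_MASK on CUDA

template<int width = 32>
static __device__ __forceinline__ float warp_reduce_sum_sketch(float x) {
#pragma unroll
    for (int offset = width/2; offset > 0; offset >>= 1) {
        // butterfly step: lane i adds the value held by lane i ^ offset
        x += __shfl_xor_sync(SKETCH_WARP_MASK, x, offset, width);
    }
    return x;   // identical on every lane of the warp afterwards
}

__global__ void row_sum_sketch(const float * x, float * out) {
    float acc = x[threadIdx.x];                 // launched as <<<1, 32>>>
    acc = warp_reduce_sum_sketch<32>(acc);
    if (threadIdx.x == 0) {
        *out = acc;
    }
}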

ggml/src/ggml-cuda/fattn-common.cuh

Lines changed: 2 additions & 2 deletions
@@ -314,8 +314,8 @@ static __device__ __forceinline__ void quantize_q8_1_to_shared(
     }
 #pragma unroll
     for (int mask = QI8_1/2; mask > 0; mask >>= 1) {
-        amax = fmaxf(amax, __shfl_xor_sync(GGML_WARP_SYNC_MASK, amax, mask, 32));
-        sum += __shfl_xor_sync(GGML_WARP_SYNC_MASK, sum, mask, 32);
+        amax = fmaxf(amax, __shfl_xor_sync(GGML_CUDA_WARP_MASK, amax, mask, 32));
+        sum += __shfl_xor_sync(GGML_CUDA_WARP_MASK, sum, mask, 32);
     }

     const float d = amax / 127;
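For orientation, the loop above spreads the per-thread absolute maximum (and the running sum used for the q8_1 block sum) across the threads that share one block, and the scale then falls out as d = amax / 127. A minimal scalar sketch of that quantization step (hypothetical helper written as plain host code, not the fused shared-memory kernel this diff touches):

// q8_block_sketch.c -- illustrative sketch only
#include <math.h>
#include <stdint.h>

// Quantize n floats to int8 with one shared scale, q8_1-style:
// d = amax / 127, q_i = round(x_i / d).
static void quantize_block_sketch(const float * x, int8_t * q, float * d_out, int n) {
    float amax = 0.0f;
    for (int i = 0; i < n; ++i) {
        amax = fmaxf(amax, fabsf(x[i]));
    }
    const float d  = amax / 127.0f;
    const float id = d != 0.0f ? 1.0f/d : 0.0f;   // guard against an all-zero block
    for (int i = 0; i < n; ++i) {
        q[i] = (int8_t) roundf(x[i] * id);
    }
    *d_out = d;
}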

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 5 additions & 5 deletions
@@ -572,7 +572,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
     for (int col = 0; col < cols_per_thread; ++col) {
 #pragma unroll
         for (int offset = 16; offset >= 4; offset >>= 1) {
-            KQ_max_new[col] = fmaxf(KQ_max_new[col], __shfl_xor_sync(GGML_WARP_SYNC_MASK, KQ_max_new[col], offset, WARP_SIZE));
+            KQ_max_new[col] = fmaxf(KQ_max_new[col], __shfl_xor_sync(GGML_CUDA_WARP_MASK, KQ_max_new[col], offset, WARP_SIZE));
         }
     }

@@ -627,7 +627,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
     for (int col = 0; col < cols_per_thread; ++col) {
 #pragma unroll
         for (int offset = 2; offset >= 1; offset >>= 1) {
-            KQ_max_new[col] = fmaxf(KQ_max_new[col], __shfl_xor_sync(GGML_WARP_SYNC_MASK, KQ_max_new[col], offset, WARP_SIZE));
+            KQ_max_new[col] = fmaxf(KQ_max_new[col], __shfl_xor_sync(GGML_CUDA_WARP_MASK, KQ_max_new[col], offset, WARP_SIZE));
         }
     }

@@ -953,7 +953,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
     for (int col = 0; col < cols_per_thread; ++col) {
 #pragma unroll
         for (int offset = offset_first; offset >= offset_last; offset >>= 1) {
-            KQ_rowsum[col] += __shfl_xor_sync(GGML_WARP_SYNC_MASK, KQ_rowsum[col], offset, WARP_SIZE);
+            KQ_rowsum[col] += __shfl_xor_sync(GGML_CUDA_WARP_MASK, KQ_rowsum[col], offset, WARP_SIZE);
         }
     }
 }
@@ -1086,7 +1086,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
 #pragma unroll
     for (int offset = np*cols_per_warp/2; offset >= cols_per_warp; offset >>= 1) {
         if (offset < WARP_SIZE) {
-            KQ_cmn = fmaxf(KQ_cmn, __shfl_xor_sync(GGML_WARP_SYNC_MASK, KQ_cmn, offset, WARP_SIZE));
+            KQ_cmn = fmaxf(KQ_cmn, __shfl_xor_sync(GGML_CUDA_WARP_MASK, KQ_cmn, offset, WARP_SIZE));
         }
     }

@@ -1104,7 +1104,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
 #pragma unroll
     for (int offset = np*cols_per_warp/2; offset >= cols_per_warp; offset >>= 1) {
         if (offset < WARP_SIZE) {
-            KQ_crs += __shfl_xor_sync(GGML_WARP_SYNC_MASK, KQ_crs, offset, WARP_SIZE);
+            KQ_crs += __shfl_xor_sync(GGML_CUDA_WARP_MASK, KQ_crs, offset, WARP_SIZE);
         }
     }


ggml/src/ggml-cuda/mma.cuh

Lines changed: 2 additions & 2 deletions
@@ -47,8 +47,8 @@ static __device__ __forceinline__ int ggml_cuda_movmatrix(const int x) {
     const int shift_low  = ((src_j + 0) % 2) * 16;
     const int shift_high = ((src_j + 1) % 2) * 16;

-    const int ret_low  = (__shfl_sync(GGML_WARP_SYNC_MASK, x, src_laneid_low,  WARP_SIZE) >> shift_low)  & 0x0000FFFF;
-    const int ret_high = (__shfl_sync(GGML_WARP_SYNC_MASK, x, src_laneid_high, WARP_SIZE) << shift_high) & 0xFFFF0000;
+    const int ret_low  = (__shfl_sync(GGML_CUDA_WARP_MASK, x, src_laneid_low,  WARP_SIZE) >> shift_low)  & 0x0000FFFF;
+    const int ret_high = (__shfl_sync(GGML_CUDA_WARP_MASK, x, src_laneid_high, WARP_SIZE) << shift_high) & 0xFFFF0000;

     return ret_low | ret_high;
 }

ggml/src/ggml-cuda/quantize.cu

Lines changed: 2 additions & 2 deletions
@@ -89,7 +89,7 @@ static __global__ void quantize_mmq_q8_1(
     // Exchange max. abs. value between vals_per_scale/4 threads.
 #pragma unroll
     for (int offset = vals_per_scale/8; offset > 0; offset >>= 1) {
-        amax = fmaxf(amax, __shfl_xor_sync(GGML_WARP_SYNC_MASK, amax, offset, WARP_SIZE));
+        amax = fmaxf(amax, __shfl_xor_sync(GGML_CUDA_WARP_MASK, amax, offset, WARP_SIZE));
     }

     float sum;
@@ -99,7 +99,7 @@ static __global__ void quantize_mmq_q8_1(
     // Calculate sums across vals_per_sum/4 threads.
 #pragma unroll
     for (int offset = vals_per_sum/8; offset > 0; offset >>= 1) {
-        sum += __shfl_xor_sync(GGML_WARP_SYNC_MASK, sum, offset, WARP_SIZE);
+        sum += __shfl_xor_sync(GGML_CUDA_WARP_MASK, sum, offset, WARP_SIZE);
     }
 }


ggml/src/ggml-cuda/vendors/hip.h

Lines changed: 2 additions & 2 deletions
@@ -152,9 +152,9 @@

 // Warp sync functions and masks
 #if HIP_VERSION >= 70000000 && defined(GGML_HIP_ROCWMMA_FATTN)
-#define GGML_WARP_SYNC_MASK 0xffffffffffffffffULL // ROCm 7.0+ requires 64-bit masks for __*_*_sync functions
+#define GGML_CUDA_WARP_MASK 0xffffffffffffffffULL // ROCm 7.0+ requires 64-bit masks for __*_*_sync functions
 #else
-#define GGML_WARP_SYNC_MASK 0xffffffff
+#define GGML_CUDA_WARP_MASK 0xffffffff
 #define __shfl_sync(mask, var, laneMask, width) __shfl(var, laneMask, width)
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #endif // HIP_VERSION >= 70000000 && defined(GGML_HIP_ROCWMMA_FATTN)
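The two branches above are what keep the renamed macro portable on HIP: with ROCm 7.0+ and rocWMMA flash attention enabled, the __*_sync builtins take 64-bit lane masks, so GGML_CUDA_WARP_MASK expands to the all-ones 64-bit value; on older ROCm the *_sync wrappers are rewritten to the unmasked __shfl variants and the mask argument is simply discarded. As a sketch, a typical call site from common.cuh resolves roughly as follows (expansions paraphrased from the macros in this header):

// source as written:
//     x += __shfl_xor_sync(GGML_CUDA_WARP_MASK, x, offset, width);
// ROCm >= 7.0 with GGML_HIP_ROCWMMA_FATTN:
//     x += __shfl_xor_sync(0xffffffffffffffffULL, x, offset, width);
// older ROCm (mask dropped by the macro above):
//     x += __shfl_xor(x, offset, width);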
