Skip to content

Commit b20f068

Browse files
committed
Renames the rest of the compute capability macros for consistency.
1 parent 3974cf6 commit b20f068

File tree

6 files changed

+29
-29
lines changed

6 files changed

+29
-29
lines changed

ggml/src/ggml-cuda/common.cuh

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -46,23 +46,23 @@
4646
#define GGML_CUDA_CC_VOLTA 700
4747
#define GGML_CUDA_CC_TURING 750
4848
#define GGML_CUDA_CC_AMPERE 800
49-
#define CC_OFFSET_AMD 1000000
49+
#define GGML_CUDA_CC_OFFSET_AMD 1000000
5050

5151
// GCN/CDNA, wave size is 64
52-
#define CC_GCN4 (CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
53-
#define CC_VEGA (CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
54-
#define CC_VEGA20 (CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
55-
#define CC_CDNA (CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
56-
#define CC_CDNA2 (CC_OFFSET_AMD + 910) // MI210, minimum acc register renameing
57-
#define CC_CDNA3 (CC_OFFSET_AMD + 942) // MI300
52+
#define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 803) // Tonga, Fiji, Polaris, minimum for fast fp16
53+
#define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 900) // Vega56/64, minimum for fp16 dual issue
54+
#define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 906) // MI50/Radeon VII, minimum for dp4a
55+
#define GGML_CUDA_CC_CDNA (GGML_CUDA_CC_OFFSET_AMD + 908) // MI100, minimum for MFMA, acc registers
56+
#define GGML_CUDA_CC_CDNA2 (GGML_CUDA_CC_OFFSET_AMD + 910) // MI210, minimum acc register renaming
57+
#define GGML_CUDA_CC_CDNA3 (GGML_CUDA_CC_OFFSET_AMD + 942) // MI300
5858

5959
// RDNA removes MFMA, dp4a, xnack, acc registers, wave size is 32
60-
#define CC_RDNA1 (CC_OFFSET_AMD + 1010) // RX 5000
61-
#define CC_RDNA2 (CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
62-
#define CC_RDNA3 (CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
60+
#define GGML_CUDA_CC_RDNA1 (GGML_CUDA_CC_OFFSET_AMD + 1010) // RX 5000
61+
#define GGML_CUDA_CC_RDNA2 (GGML_CUDA_CC_OFFSET_AMD + 1030) // RX 6000, minimum for dp4a
62+
#define GGML_CUDA_CC_RDNA3 (GGML_CUDA_CC_OFFSET_AMD + 1100) // RX 7000, minimum for WMMA
6363

64-
#define CC_QY1 210
65-
#define CC_QY2 220
64+
#define GGML_CUDA_CC_QY1 210
65+
#define GGML_CUDA_CC_QY2 220
6666

6767
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
6868

@@ -147,20 +147,20 @@ typedef float2 dfloat2;
147147
#define INT8_MMA_AVAILABLE
148148
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
149149

150-
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
150+
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
151151
#define FLASH_ATTN_AVAILABLE
152-
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
152+
#endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
153153

154154
static constexpr bool fast_fp16_available(const int cc) {
155155
return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
156156
}
157157

158158
static constexpr bool fp16_mma_available(const int cc) {
159-
return cc < CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
159+
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA;
160160
}
161161

162162
static constexpr bool int8_mma_available(const int cc) {
163-
return cc < CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
163+
return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_TURING;
164164
}
165165

166166
[[noreturn]]

ggml/src/ggml-cuda/fattn.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
304304
const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
305305

306306
// On AMD the tile kernels perform poorly, use the vec kernel instead:
307-
if (cc >= CC_OFFSET_AMD) {
307+
if (cc >= GGML_CUDA_CC_OFFSET_AMD) {
308308
if (prec == GGML_PREC_DEFAULT && fast_fp16_available(cc)) {
309309
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
310310
} else {

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
177177
info.devices[id].smpb = prop.sharedMemPerBlock;
178178
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
179179
info.devices[id].smpbo = prop.sharedMemPerBlock;
180-
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
180+
info.devices[id].cc = 100*prop.major + 10*prop.minor + GGML_CUDA_CC_OFFSET_AMD;
181181
#else
182182
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
183183
info.devices[id].cc = 100*prop.major + 10*prop.minor;
@@ -1108,7 +1108,7 @@ static void ggml_cuda_op_mul_mat_cublas(
11081108
const half beta_f16 = 0.0f;
11091109

11101110
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
1111-
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
1111+
if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
11121112
cu_compute_type = CUBLAS_COMPUTE_32F;
11131113
}
11141114

@@ -1612,7 +1612,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
16121612
cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F;
16131613
cudaDataType_t cu_data_type = CUDA_R_16F;
16141614

1615-
if (ggml_cuda_info().devices[ctx.device].cc == CC_CDNA) {
1615+
if (ggml_cuda_info().devices[ctx.device].cc == GGML_CUDA_CC_CDNA) {
16161616
cu_compute_type = CUBLAS_COMPUTE_32F;
16171617
}
16181618

@@ -3028,7 +3028,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
30283028
return true;
30293029
}
30303030
const int cc = ggml_cuda_info().devices[dev_ctx->device].cc;
3031-
return cc >= GGML_CUDA_CC_VOLTA && cc < CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
3031+
return cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD && op->src[1]->type == GGML_TYPE_F16 && op->src[2]->type == GGML_TYPE_F16;
30323032
}
30333033
case GGML_OP_CROSS_ENTROPY_LOSS:
30343034
case GGML_OP_CROSS_ENTROPY_LOSS_BACK:

ggml/src/ggml-cuda/mmq.cu

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ void ggml_cuda_op_mul_mat_q(
2727
// The stream-k decomposition is only faster for recent NVIDIA GPUs.
2828
// Also its fixup needs to allocate a temporary buffer in the memory pool.
2929
// There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
30-
const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < CC_OFFSET_AMD && src1_ncols == ne11;
30+
const bool use_stream_k = compute_capability >= GGML_CUDA_CC_VOLTA && compute_capability < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
3131
const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
3232

3333
switch (src0->type) {
@@ -144,9 +144,9 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
144144
return true;
145145
#endif //GGML_CUDA_FORCE_MMQ
146146

147-
if (cc < CC_OFFSET_AMD) {
147+
if (cc < GGML_CUDA_CC_OFFSET_AMD) {
148148
return cc < GGML_CUDA_CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
149149
}
150150

151-
return (cc < CC_RDNA3 && cc != CC_CDNA && cc != CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
151+
return (cc < GGML_CUDA_CC_RDNA3 && cc != GGML_CUDA_CC_CDNA && cc != GGML_CUDA_CC_VEGA20) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
152152
}

ggml/src/ggml-cuda/mmq.cuh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,9 @@ struct tile_x_sizes {
8989
static constexpr int get_mmq_x_max_host(const int cc) {
9090
return int8_mma_available(cc) ? 128 :
9191
#ifdef GGML_CUDA_FORCE_MMQ
92-
cc >= GGML_CUDA_CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
92+
cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? 128 : 64;
9393
#else
94-
cc >= GGML_CUDA_CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
94+
cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ? MMQ_DP4A_MAX_BATCH_SIZE : 64;
9595
#endif // GGML_CUDA_FORCE_MMQ
9696
}
9797

@@ -120,7 +120,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
120120
}
121121

122122
static constexpr int get_mmq_y_host(const int cc) {
123-
return cc >= CC_OFFSET_AMD ? (cc == CC_RDNA1 ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
123+
return cc >= GGML_CUDA_CC_OFFSET_AMD ? (cc == GGML_CUDA_CC_RDNA1 ? 64 : 128) : (cc >= GGML_CUDA_CC_VOLTA ? 128 : 64);
124124
}
125125

126126
static constexpr __device__ int get_mmq_y_device() {
@@ -2825,7 +2825,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
28252825
const int mmq_x_max = get_mmq_x_max_host(cc);
28262826
const int mmq_y = get_mmq_y_host(cc);
28272827
const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
2828-
const bool use_stream_k = cc >= GGML_CUDA_CC_VOLTA && cc < CC_OFFSET_AMD;
2828+
const bool use_stream_k = cc >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;
28292829

28302830
int mmq_x_best = 0;
28312831
int nparts_best = INT_MAX;

ggml/src/ggml-cuda/mmvq.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ static void mul_mat_vec_q_cuda(
142142
int64_t nwarps = 1;
143143
int64_t rows_per_cuda_block = 1;
144144

145-
if (ggml_cuda_info().devices[id].cc < CC_CDNA || ggml_cuda_info().devices[id].cc == CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
145+
if (ggml_cuda_info().devices[id].cc < GGML_CUDA_CC_CDNA || ggml_cuda_info().devices[id].cc == GGML_CUDA_CC_RDNA1) { // NVIDIA and AMD older than RDNA2 but not CDNA
146146
switch(ncols_y) {
147147
case 1:
148148
nwarps = 4;

0 commit comments

Comments
 (0)