
Commit 21b7d0a

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.devops/rocm.Dockerfile
#	docs/build-s390x.md
#	docs/development/HOWTO-add-model.md
#	docs/ops.md
#	docs/ops/CPU.csv
#	docs/ops/CUDA.csv
#	ggml/CMakeLists.txt
#	ggml/src/ggml-cann/acl_tensor.cpp
#	ggml/src/ggml-cann/aclnn_ops.cpp
#	ggml/src/ggml-cann/aclnn_ops.h
#	ggml/src/ggml-cann/ggml-cann.cpp
#	ggml/src/ggml-cpu/CMakeLists.txt
#	ggml/src/ggml-opencl/ggml-opencl.cpp
#	ggml/src/ggml-opencl/kernels/rms_norm.cl
#	scripts/create_ops_docs.py
#	tests/test-backend-ops.cpp
#	tools/export-lora/export-lora.cpp

2 parents ba626b3 + 4762ad7

File tree

19 files changed: +1521 -792 lines

convert_hf_to_gguf.py

Lines changed: 2 additions & 2 deletions

@@ -3791,7 +3791,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_block_count(block_count)
         self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32))
         self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06))
-        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 1000000.0))
+        self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000))

         # Mamba parameters
         self.gguf_writer.add_ssm_state_size(hparams.get("mamba_d_state", 64))
@@ -3802,7 +3802,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_ssm_group_count(0)

         # MLP feed forward parameters (for attention layers)
-        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 16384))
+        self.gguf_writer.add_feed_forward_length(hparams.get("intermediate_size", 13312))
         self.gguf_writer.add_file_type(self.ftype)

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:

ggml/src/ggml-cuda/common.cuh

Lines changed: 13 additions & 3 deletions

@@ -56,7 +56,7 @@
 #define GGML_CUDA_CC_GCN4   (GGML_CUDA_CC_OFFSET_AMD + 0x803)  // Tonga, Fiji, Polaris, minimum for fast fp16
 #define GGML_CUDA_CC_VEGA   (GGML_CUDA_CC_OFFSET_AMD + 0x900)  // Vega56/64, minimum for fp16 dual issue
 #define GGML_CUDA_CC_VEGA20 (GGML_CUDA_CC_OFFSET_AMD + 0x906)  // MI50/Radeon VII, minimum for dp4a
-#define GGML_CUDA_CC_CDNA   (GGML_CUDA_CC_OFFSET_AMD + 0x908)  // MI100, minimum for MFMA, acc registers
+#define GGML_CUDA_CC_CDNA1  (GGML_CUDA_CC_OFFSET_AMD + 0x908)  // MI100, minimum for MFMA, acc registers
 #define GGML_CUDA_CC_CDNA2  (GGML_CUDA_CC_OFFSET_AMD + 0x910)  // MI210, minimum acc register renaming
 #define GGML_CUDA_CC_CDNA3  (GGML_CUDA_CC_OFFSET_AMD + 0x942)  // MI300

@@ -72,8 +72,9 @@
 #define GGML_CUDA_CC_IS_RDNA2(cc) (cc >= GGML_CUDA_CC_RDNA2 && cc < GGML_CUDA_CC_RDNA3)
 #define GGML_CUDA_CC_IS_RDNA3(cc) (cc >= GGML_CUDA_CC_RDNA3 && cc < GGML_CUDA_CC_RDNA4)
 #define GGML_CUDA_CC_IS_RDNA4(cc) (cc >= GGML_CUDA_CC_RDNA4)
-#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
-#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_GCN(cc)   (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA1)
+#define GGML_CUDA_CC_IS_CDNA(cc)  (cc >= GGML_CUDA_CC_CDNA1 && cc < GGML_CUDA_CC_RDNA1)
+#define GGML_CUDA_CC_IS_CDNA3(cc) (cc >= GGML_CUDA_CC_CDNA3 && cc < GGML_CUDA_CC_RDNA1)

 // Moore Threads
 #define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000

@@ -230,6 +231,10 @@ typedef float2 dfloat2;
 #define FP16_MMA_AVAILABLE
 #endif // defined(GGML_HIP_ROCWMMA_FATTN) && (defined(CDNA) || defined(RDNA3) || (defined(GGML_HIP_ROCWMMA_FATTN_GFX12) && defined(RDNA4)))

+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+#define AMD_MFMA_AVAILABLE
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && defined(CDNA3)
+
 #if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
 #define NEW_MMA_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING

@@ -292,6 +297,11 @@ static bool fp32_mma_hardware_available(const int cc) {
     return GGML_CUDA_CC_IS_CDNA(cc);
 }

+// AMD CDNA3 matrix cores. Support for other CDNA generations will be added later.
+static bool amd_mfma_available(const int cc) {
+    return cc >= GGML_CUDA_CC_OFFSET_AMD && GGML_CUDA_CC_IS_CDNA3(cc);
+}
+
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 static bool new_mma_available(const int cc) {
     return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
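The rename from `GGML_CUDA_CC_CDNA` to `GGML_CUDA_CC_CDNA1` anchors the lower bound of the CDNA range explicitly, and the new `GGML_CUDA_CC_IS_CDNA3` check nests inside `GGML_CUDA_CC_IS_CDNA`. A minimal host-side sketch of the range logic, with the AMD offset and RDNA1 base inlined as assumptions (the real constants are defined earlier in common.cuh):

// Sketch only: classify an AMD compute capability the way the macros above do.
// CC_OFFSET_AMD and CC_RDNA1 are assumed values for illustration.
#include <cstdio>

constexpr int CC_OFFSET_AMD = 0x1000000;              // assumed AMD offset
constexpr int CC_CDNA1      = CC_OFFSET_AMD + 0x908;  // MI100
constexpr int CC_CDNA3      = CC_OFFSET_AMD + 0x942;  // MI300
constexpr int CC_RDNA1      = CC_OFFSET_AMD + 0x1010; // assumed RDNA1 base

static bool is_cdna (int cc) { return cc >= CC_CDNA1 && cc < CC_RDNA1; }
static bool is_cdna3(int cc) { return cc >= CC_CDNA3 && cc < CC_RDNA1; }

int main() {
    const int mi100 = CC_OFFSET_AMD + 0x908;
    const int mi300 = CC_OFFSET_AMD + 0x942;
    std::printf("MI100: CDNA=%d CDNA3=%d\n", is_cdna(mi100), is_cdna3(mi100)); // 1 0
    std::printf("MI300: CDNA=%d CDNA3=%d\n", is_cdna(mi300), is_cdna3(mi300)); // 1 1
    return 0;
}

Because both checks share the RDNA1 upper bound, every CDNA3 value also classifies as CDNA, which is what `amd_mfma_available` relies on.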

ggml/src/ggml-cuda/fattn-mma-f16.cuh

Lines changed: 10 additions & 8 deletions

@@ -1330,14 +1330,16 @@ static __global__ void flash_attn_ext_f16(
             ne01, ne02, stride_Q1, stride_Q2, stride_K, stride_V, stride_mask, jt, kb0_start_kernel, kb0_stop_kernel);
 #else
     GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
-    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
-    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
-    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap); GGML_UNUSED(ne00);
-    GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03); GGML_UNUSED(ne10);
-    GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
-    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13); GGML_UNUSED(nb21);
-    GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
+    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
+    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
+    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
     NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(NEW_MMA_AVAILABLE)
 }
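The fattn hunks in this commit are mechanical: the `GGML_UNUSED` lists are regrouped by tensor (`ne0*`/`nb0*`, `ne1*`/`nb1*`, ...) and extended with the new `ne33`/`nb33` strides so every kernel stub marks its full parameter list in one consistent order. For context, a minimal sketch of the stub idiom these lines serve; the macro bodies here are illustrative assumptions, ggml defines its own in common.cuh:

// Illustrative definitions only -- ggml's real macros live in common.cuh
// and additionally report which kernel lacked device support.
#define GGML_UNUSED(x) (void)(x)
#define NO_DEVICE_CODE __trap()

__global__ void kernel_stub(const float * Q, float * dst, const int ne00) {
#ifdef FEATURE_AVAILABLE
    // ... the real kernel body would be compiled here ...
#else
    // Every parameter must still be referenced, otherwise the dead branch
    // can trigger unused-parameter warnings.
    GGML_UNUSED(Q); GGML_UNUSED(dst); GGML_UNUSED(ne00);
    NO_DEVICE_CODE; // abort if this stub is ever actually launched
#endif
}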

ggml/src/ggml-cuda/fattn-tile-f32.cu

Lines changed: 14 additions & 14 deletions

@@ -37,16 +37,16 @@ static __global__ void flash_attn_tile_ext_f32(
 #endif // FP16_MMA_AVAILABLE
     if (use_logit_softcap && !(D == 128 || D == 256)) {
         GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
-        GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
-        GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+        GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
+        GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
         GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
-        GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
-        GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
-        GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32);
-        GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
-        GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
-        GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
-        GGML_UNUSED(nb23);
+        GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
+        GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+        GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
+        GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
+        GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+        GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
+        GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
         NO_DEVICE_CODE;
         return;
     }
@@ -282,16 +282,16 @@ static __global__ void flash_attn_tile_ext_f32(
     }
 #else
     GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
-    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
-    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
+    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
     GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
     GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
-    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
-    GGML_UNUSED(ne31); GGML_UNUSED(ne32);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb32);
     GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
     GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
     GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
     NO_DEVICE_CODE;
 #endif // FLASH_ATTN_AVAILABLE
 }

ggml/src/ggml-cuda/fattn-vec-f16.cuh

Lines changed: 9 additions & 9 deletions

@@ -329,16 +329,16 @@ static __global__ void flash_attn_vec_ext_f16(
     }
 #else
    GGML_UNUSED(Q); GGML_UNUSED(K); GGML_UNUSED(V); GGML_UNUSED(mask);
-    GGML_UNUSED(dst); GGML_UNUSED(dst_meta); GGML_UNUSED(scale);
-    GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
+    GGML_UNUSED(dst); GGML_UNUSED(dst_meta);
+    GGML_UNUSED(scale); GGML_UNUSED(max_bias); GGML_UNUSED(m0); GGML_UNUSED(m1);
     GGML_UNUSED(n_head_log2); GGML_UNUSED(logit_softcap);
-    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02);
-    GGML_UNUSED(ne03); GGML_UNUSED(ne10); GGML_UNUSED(ne11);
-    GGML_UNUSED(ne12); GGML_UNUSED(ne13); GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne32);
-    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33); GGML_UNUSED(nb01); GGML_UNUSED(nb02);
-    GGML_UNUSED(nb03); GGML_UNUSED(nb11); GGML_UNUSED(nb12);
-    GGML_UNUSED(nb13); GGML_UNUSED(nb21); GGML_UNUSED(nb22);
-    GGML_UNUSED(nb23);
+    GGML_UNUSED(ne00); GGML_UNUSED(ne01); GGML_UNUSED(ne02); GGML_UNUSED(ne03);
+    GGML_UNUSED(nb01); GGML_UNUSED(nb02); GGML_UNUSED(nb03);
+    GGML_UNUSED(ne10); GGML_UNUSED(ne11); GGML_UNUSED(ne12); GGML_UNUSED(ne13);
+    GGML_UNUSED(nb11); GGML_UNUSED(nb12); GGML_UNUSED(nb13);
+    GGML_UNUSED(nb21); GGML_UNUSED(nb22); GGML_UNUSED(nb23);
+    GGML_UNUSED(ne31); GGML_UNUSED(ne32); GGML_UNUSED(ne33);
+    GGML_UNUSED(nb31); GGML_UNUSED(nb32); GGML_UNUSED(nb33);
     NO_DEVICE_CODE;
 #endif // defined(FLASH_ATTN_AVAILABLE) && defined(FP16_AVAILABLE)
 }

ggml/src/ggml-cuda/mma.cuh

Lines changed: 111 additions & 3 deletions

@@ -12,7 +12,8 @@
 // The methods get_i and get_j can be used to get the physical 32 bit index of the lth element of a thread within a tile.
 // All matrix tiles have ne physical 32 bit elements per warp.
 //
-// As described in the documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
+// As described in the PTX documentation, all pointers for load_ldmatrix must be to shared memory and aligned to 16 bytes.
+// The API in this file also assumes that the pointers for load_generic are aligned to 16 bytes; unaligned pointers are considered undefined behavior.

 #include "common.cuh"

@@ -66,7 +67,44 @@ namespace ggml_cuda_mma {
     struct tile {
         static constexpr int I  = I_;
         static constexpr int J  = J_;
-        static constexpr int ne = I * J / WARP_SIZE;
+
+#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
+        static constexpr int ne = I * J / 64;
+        T x[ne] = {0};
+
+        static __device__ __forceinline__ int get_i(const int l) {
+            if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+                return threadIdx.x % 16;
+            } else if constexpr (I == 16 && J == 8) {
+                return threadIdx.x % 16;
+            } else if constexpr (I == 32 && J == 4) {
+                return threadIdx.x % 32;
+            } else if constexpr (I == 16 && J == 16) {
+                return 4 * (threadIdx.x / 16) + l;
+            } else if constexpr (I == 32 && J == 32) {
+                return 4 * (threadIdx.x / 32) + 8 * (l / 4) + (l % 4);
+            } else {
+                static_assert(I == -1 && J == -1, "template specialization not implemented");
+            }
+        }
+
+        static __device__ __forceinline__ int get_j(const int l) {
+            if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+                return 2 * ((threadIdx.x / 16) % 2) + l;
+            } else if constexpr (I == 16 && J == 8) {
+                return 2 * (threadIdx.x / 16) + l;
+            } else if constexpr (I == 32 && J == 4) {
+                return 2 * (threadIdx.x / 32) + l;
+            } else if constexpr (I == 16 && J == 16) {
+                return threadIdx.x % 16;
+            } else if constexpr (I == 32 && J == 32) {
+                return threadIdx.x % 32;
+            } else {
+                static_assert(I == -1 && J == -1, "template specialization not implemented");
+            }
+        }
+#else
+        static constexpr int ne = I * J / 32;
         T x[ne] = {0};

         static __device__ __forceinline__ int get_i(const int l) {
@@ -94,6 +132,7 @@ namespace ggml_cuda_mma {
                 static_assert(I == -1 && J == -1, "template specialization not implemented");
             }
         }
+#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
     };

     template <int I_, int J_>
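The key change in the HIP branch of `tile` is the 64-lane wavefront: `ne` becomes `I * J / 64` instead of `I * J / 32`, so each lane owns half as many elements and the `get_i`/`get_j` mappings are redefined per shape. A quick host-side check (a sketch, not part of the commit) that the `tile<16, 16>` mapping partitions the tile exactly once across 64 lanes:

// Host-side sketch: replicate the AMD tile<16, 16> mapping above and check
// that 64 lanes x ne elements cover all 16*16 positions exactly once.
#include <cstdio>

int main() {
    const int I = 16, J = 16, lanes = 64, ne = I * J / lanes; // ne == 4
    int hits[16][16] = {};
    for (int tid = 0; tid < lanes; ++tid) {
        for (int l = 0; l < ne; ++l) {
            const int i = 4 * (tid / 16) + l; // get_i for tile<16, 16>
            const int j = tid % 16;           // get_j for tile<16, 16>
            ++hits[i][j];
        }
    }
    int bad = 0;
    for (int i = 0; i < I; ++i)
        for (int j = 0; j < J; ++j)
            bad += hits[i][j] != 1;
    std::printf("positions not covered exactly once: %d\n", bad); // expect 0
    return 0;
}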
@@ -148,10 +187,23 @@ namespace ggml_cuda_mma {

     template <int I, int J, typename T>
     static __device__ __forceinline__ void load_generic(tile<I, J, T> & t, const T * __restrict__ xs0, const int stride) {
+#if defined(AMD_MFMA_AVAILABLE)
+        if constexpr (I == 64 && J == 2) { // Special tile size to load <16, 4> as <16, 8>
+#pragma unroll
+            for (int l = 0; l < t.ne; ++l) {
+                t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
+            }
+        } else {
+            int64_t * xi = (int64_t *) t.x;
+            const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
+            xi[0] = xs[0];
+        }
+#else
 #pragma unroll
         for (int l = 0; l < t.ne; ++l) {
             t.x[l] = xs0[t.get_i(l)*stride + t.get_j(l)];
         }
+#endif // defined(AMD_MFMA_AVAILABLE)
     }

     template <typename T>
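On the AMD path every shape except the special `<64, 2>` case is filled with one 8-byte load per lane, which is why the `else` branch reinterprets `t.x` as `int64_t`: the two consecutive `int` elements a lane owns are adjacent in memory (and, per the comment at the top of the file, `xs0` must be 16-byte aligned for this to be defined behavior). A small host-side consistency check, a sketch assuming a row-major source with `stride` in units of `int`:

// Sketch (not part of the commit): verify that the 8-byte vectorized load in
// the AMD path touches exactly the two ints that get_i/get_j would address
// for a tile<16, 8, int>, where ne == 16*8/64 == 2.
#include <cstdio>

int main() {
    const int I = 16, stride = 8; // row stride in ints; any stride works
    int bad = 0;
    for (int tid = 0; tid < 64; ++tid) {
        // base index of the int64 load, in units of int
        const int vec = (tid % I) * stride + 2 * (tid / I);
        for (int l = 0; l < 2; ++l) {
            const int i = tid % 16;           // get_i(l) for tile<16, 8>
            const int j = 2 * (tid / 16) + l; // get_j(l) for tile<16, 8>
            bad += (i * stride + j) != vec + l;
        }
    }
    std::printf("mismatches: %d\n", bad); // expect 0
    return 0;
}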
@@ -186,7 +238,7 @@ namespace ggml_cuda_mma {
     template <typename T>
     static __device__ __forceinline__ void load_ldmatrix(
             tile<16, 8, T> & t, const T * __restrict__ xs0, const int stride) {
-#ifdef NEW_MMA_AVAILABLE
+#if defined(NEW_MMA_AVAILABLE)
         int * xi = (int * ) t.x;
         const int * xs = (const int *) xs0 + (threadIdx.x % t.I) * stride + (threadIdx.x / t.I) * (t.J / 2);
         asm volatile("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
@@ -393,4 +445,60 @@ namespace ggml_cuda_mma {
         NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
     }
+
+    static __device__ __forceinline__ void mma(
+            tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
+#if defined(AMD_MFMA_AVAILABLE)
+        using int32x4_t = __attribute__((__vector_size__(4 * sizeof(int)))) int;
+        int32x4_t * acc = (int32x4_t *) D.x;
+#if defined(CDNA3)
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x32_i8(((int64_t *) A.x)[0],
+                                                       ((int64_t *) B.x)[0],
+                                                       acc[0],
+                                                       0, 0, 0);
+#elif defined(CDNA2) || defined(CDNA)
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[0],
+                                                      B.x[0],
+                                                      acc[0],
+                                                      0, 0, 0);
+        acc[0] = __builtin_amdgcn_mfma_i32_16x16x16i8(A.x[1],
+                                                      B.x[1],
+                                                      acc[0],
+                                                      0, 0, 0);
+#endif // defined(CDNA3)
+#else
+        GGML_UNUSED(D);
+        GGML_UNUSED(A);
+        GGML_UNUSED(B);
+        NO_DEVICE_CODE;
+#endif // AMD_MFMA_AVAILABLE
+    }
+
+    static __device__ __forceinline__ void mma(
+            tile<32, 32, int> & D, const tile<32, 4, int> & A, const tile<32, 4, int> & B) {
+#if defined(AMD_MFMA_AVAILABLE)
+        using int32x16_t = __attribute__((__vector_size__(16 * sizeof(int)))) int;
+        int32x16_t * acc = (int32x16_t *) D.x;
+#if defined(CDNA3)
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x16_i8(((int64_t *) A.x)[0],
+                                                       ((int64_t *) B.x)[0],
+                                                       acc[0],
+                                                       0, 0, 0);
+#elif defined(CDNA2) || defined(CDNA)
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[0],
+                                                     B.x[0],
+                                                     acc[0],
+                                                     0, 0, 0);
+        acc[0] = __builtin_amdgcn_mfma_i32_32x32x8i8(A.x[1],
+                                                     B.x[1],
+                                                     acc[0],
+                                                     0, 0, 0);
+#endif // defined(CDNA3)
+#else
+        GGML_UNUSED(D);
+        GGML_UNUSED(A);
+        GGML_UNUSED(B);
+        NO_DEVICE_CODE;
+#endif // AMD_MFMA_AVAILABLE
+    }
 }
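The two new `mma()` overloads route int8 tiles to AMD's MFMA builtins: on CDNA3 a single `16x16x32` (or `32x32x16`) instruction consumes the packed 8-byte operands, while CDNA1/2 chain two narrower MFMAs over the same accumulator to cover the same K range. A hypothetical kernel sketching the intended call pattern; the operand layout and the A*B^T semantics are assumptions carried over from how the existing overloads in this file are used, not something this commit documents:

// Hypothetical usage sketch (not from the commit). Assumes a HIP translation
// unit that includes mma.cuh and 16-byte aligned source pointers.
using namespace ggml_cuda_mma;

__global__ void mma_i8_demo(const int * A_sh, const int * B_sh, int * out) {
    tile<16, 8, int>  A, B; // 16x32 int8 operands, packed 4 int8 per int
    tile<16, 16, int> D;    // int32 accumulator; x[] is zero-initialized

    load_generic(A, A_sh, /*stride=*/8);
    load_generic(B, B_sh, /*stride=*/8);
    mma(D, A, B); // one MFMA on CDNA3, two chained MFMAs on CDNA1/2

    for (int l = 0; l < D.ne; ++l) { // ne == 16*16/64 == 4 per lane
        out[D.get_i(l) * 16 + D.get_j(l)] = D.x[l];
    }
}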

ggml/src/ggml-cuda/mmq.cu

Lines changed: 6 additions & 4 deletions

@@ -109,7 +109,8 @@ void ggml_cuda_mul_mat_q(
     const int64_t s03 = src0->nb[3] / ts_src0;
     const int64_t s3  = dst->nb[3]  / ts_dst;

-    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
+    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)));

     if (!ids) {
         const size_t nbytes_src1_q8_1 = ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1 +
@@ -250,8 +251,9 @@ void ggml_cuda_op_mul_mat_q(
     // The stream-k decomposition is only faster for recent NVIDIA GPUs.
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
-    const bool use_stream_k = GGML_CUDA_CC_IS_NVIDIA(cc) &&
-        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && src1_ncols == ne11;
+    const bool use_stream_k = ((GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA)
+        || (GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc)))
+        && src1_ncols == ne11;
     const mmq_args args = {
         src0_dd_i, src0->type, (const int *) src1_ddq_i, nullptr, nullptr, dst_dd_i,
         ne00, row_diff, src1_ncols, stride01, ne11, nrows_dst,
@@ -306,7 +308,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return false;
     }

-    if (new_mma_available(cc)) {
+    if (new_mma_available(cc) || amd_mfma_available(cc)) {
         return true;
     }
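With this change the stream-k gate reads the same at both call sites, the only difference being the extra `src1_ncols == ne11` guard where the fixup buffer is involved. Condensed into a single helper for readability (a sketch; the `cc` checks are the macros and helpers from common.cuh shown above):

// Sketch only: the stream-k gating logic after this commit, folded into one
// helper. The cc classification comes from the common.cuh macros above.
static bool use_stream_k_for(const int cc, const int64_t src1_ncols, const int64_t ne11) {
    const bool nvidia_ok = GGML_CUDA_CC_IS_NVIDIA(cc) &&
                           ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA;
    const bool amd_ok    = GGML_CUDA_CC_IS_AMD(cc) && GGML_CUDA_CC_IS_CDNA3(cc);
    // The fixup buffer must not be shared across parallel streams, hence the
    // extra src1_ncols == ne11 condition in ggml_cuda_op_mul_mat_q.
    return (nvidia_ok || amd_ok) && src1_ncols == ne11;
}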