Commit 21e3bdd

Merge pull request #24 from menloresearch/update-dev-from-master-2025-03-24-00-08
Sync master with upstream release b4944
2 parents d9b0245 + 77f9c6b commit 21e3bdd

23 files changed: +435 -166 lines changed

docs/install.md

Lines changed: 7 additions & 0 deletions
@@ -9,6 +9,13 @@ brew install llama.cpp
 ```
 The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
 
+## MacPorts
+
+```sh
+sudo port install llama.cpp
+```
+see also: https://ports.macports.org/port/llama.cpp/details/
+
 ## Nix
 
 On Mac and Linux, the Nix package manager can be used via

examples/server/server.cpp

Lines changed: 5 additions & 0 deletions
@@ -830,6 +830,11 @@ struct server_task_result_cmpl_final : server_task_result {
             ret.push_back({"timings", timings.to_json()});
         }
 
+        // extra fields for debugging purposes
+        if (verbose) {
+            ret["__verbose"] = to_json_non_oaicompat();
+        }
+
         return ret;
     }
 };
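The change attaches the server's internal, non-OpenAI-compatible state to the response under a `__verbose` key, but only when verbose output was requested, so standard clients can keep parsing the usual fields. A minimal sketch of the same pattern with nlohmann::json (the helper and the example fields are illustrative, not the actual server code):

```cpp
#include <cstdio>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Illustrative helper: merge debug-only fields into an otherwise
// OpenAI-compatible response without changing its normal shape.
static json with_debug_fields(json oai_response, const json & internal_state, bool verbose) {
    if (verbose) {
        // extra fields for debugging purposes
        oai_response["__verbose"] = internal_state;
    }
    return oai_response;
}

int main() {
    json resp = {{"object", "chat.completion"}, {"choices", json::array()}};
    json dbg  = {{"n_decoded", 42}}; // hypothetical debug payload
    printf("%s\n", with_debug_fields(resp, dbg, /*verbose=*/true).dump(2).c_str());
    return 0;
}
```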

examples/tts/tts.cpp

Lines changed: 8 additions & 0 deletions
@@ -571,6 +571,10 @@ int main(int argc, char ** argv) {
     model_ttc = llama_init_ttc.model.get();
     ctx_ttc = llama_init_ttc.context.get();
 
+    if (model_ttc == nullptr || ctx_ttc == nullptr) {
+        return ENOENT;
+    }
+
     const llama_vocab * vocab = llama_model_get_vocab(model_ttc);
 
     // TODO: refactor in a common struct
@@ -586,6 +590,10 @@ int main(int argc, char ** argv) {
     model_cts = llama_init_cts.model.get();
     ctx_cts = llama_init_cts.context.get();
 
+    if (model_cts == nullptr || ctx_cts == nullptr) {
+        return ENOENT;
+    }
+
     std::vector<common_sampler *> smpl(n_parallel);
     for (int i = 0; i < n_parallel; ++i) {
         params.sampling.no_perf = (i != 0);
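Both guards follow the same early-exit convention: if model or context initialization failed, `main()` returns `ENOENT` (from `<cerrno>`) so the process exits with a nonzero status instead of dereferencing a null pointer later. A self-contained sketch of the convention (the types and names here are illustrative, not the llama.cpp API):

```cpp
#include <cerrno>   // ENOENT
#include <cstdio>
#include <memory>

// Illustrative stand-in for an init result holding owning pointers that may
// be null when loading failed.
struct init_result {
    std::unique_ptr<int> model;
    std::unique_ptr<int> context;
};

static int run(const init_result & init) {
    const int * model = init.model.get();
    const int * ctx   = init.context.get();

    if (model == nullptr || ctx == nullptr) {
        fprintf(stderr, "error: failed to load model or create context\n");
        return ENOENT; // nonzero exit status that callers and scripts can check
    }
    // ... proceed with generation ...
    return 0;
}

int main() {
    return run(init_result{}); // both pointers null here, so this returns ENOENT
}
```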

ggml/src/CMakeLists.txt

Lines changed: 5 additions & 1 deletion
@@ -76,7 +76,11 @@ if (GGML_CCACHE)
             set(GGML_CCACHE_VARIANT sccache)
         endif()
         # TODO: should not be set globally
-        set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
+        if (GGML_SYCL AND GGML_CCACHE_FOUND AND WIN32)
+            set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "ccache compiler_type=icl")
+        else ()
+            set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
+        endif ()
         set(ENV{CCACHE_SLOPPINESS} time_macros)
         message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
     else()

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 31 additions & 27 deletions
@@ -3110,17 +3110,17 @@ static void ggml_compute_forward_dup_same_cont(
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-    // parallelize by elements
-    const int ne = ggml_nelements(dst);
-    const int dr = (ne + nth - 1) / nth;
-    const int ie0 = dr * ith;
-    const int ie1 = MIN(ie0 + dr, ne);
+    // parallelize by blocks
+    const int nk = ggml_nelements(src0)/ggml_blck_size(src0->type);
+    const int dr = (nk + nth - 1) / nth;
+    const int k0 = dr * ith;
+    const int k1 = MIN(k0 + dr, nk);
 
-    if (ie0 < ie1) {
+    if (k0 < k1) {
         memcpy(
-            ((char *) dst->data + ie0*nb0),
-            ((char *) src0->data + ie0*nb0),
-            (ie1 - ie0) * nb0);
+            ((char *) dst->data + k0*nb0),
+            ((char *) src0->data + k0*nb0),
+            (k1 - k0) * nb0);
     }
 }
 
@@ -4055,7 +4055,6 @@ static void ggml_compute_forward_dup_f32(
 static void ggml_compute_forward_dup_bytes(
         const struct ggml_compute_params * params,
         struct ggml_tensor * dst) {
-
     const struct ggml_tensor * src0 = dst->src[0];
 
     GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
@@ -4069,10 +4068,10 @@ static void ggml_compute_forward_dup_bytes(
     }
 
     const size_t type_size = ggml_type_size(src0->type);
+
     const int ith = params->ith; // thread index
     const int nth = params->nth; // number of threads
 
-
     // parallelize by rows
     const int nr = ne01;
     // number of rows per thread
@@ -4082,10 +4081,10 @@ static void ggml_compute_forward_dup_bytes(
     const int ir1 = MIN(ir0 + dr, nr);
 
     if (src0->type == dst->type &&
-        ne00 == ne0 &&
+        ggml_are_same_shape(src0, dst) &&
         nb00 == type_size && nb0 == type_size) {
         // copy by rows
-        const size_t rs = ne00 * type_size;
+        const size_t rs = ggml_row_size(src0->type, ne00);
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
                 for (int64_t i01 = ir0; i01 < ir1; i01++) {
@@ -4140,17 +4139,20 @@ static void ggml_compute_forward_dup_bytes(
     }
 
     // dst counters
-
-    int64_t i10 = 0;
+    int64_t k10 = 0;
     int64_t i11 = 0;
     int64_t i12 = 0;
     int64_t i13 = 0;
 
+    // number of blocks in a row
+    const int64_t nk00 = ne00 / ggml_blck_size(src0->type);
+    const int64_t nk0  = ne0  / ggml_blck_size(dst->type);
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
-            i10 += ne00 * ir0;
-            while (i10 >= ne0) {
-                i10 -= ne0;
+            k10 += nk00 * ir0;
+            while (k10 >= nk0) {
+                k10 -= nk0;
                 if (++i11 == ne1) {
                     i11 = 0;
                     if (++i12 == ne2) {
@@ -4162,14 +4164,14 @@ static void ggml_compute_forward_dup_bytes(
                 }
             }
             for (int64_t i01 = ir0; i01 < ir1; i01++) {
-                for (int64_t i00 = 0; i00 < ne00; i00++) {
-                    const char * src0_ptr = ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
-                    char * dst_ptr = ((char *) dst->data + i10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
+                for (int64_t k00 = 0; k00 < nk00; k00++) {
+                    const char * src0_ptr = ((char *) src0->data + k00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);
+                    char * dst_ptr = ((char *) dst->data + k10*nb0 + i11*nb1 + i12*nb2 + i13*nb3);
 
                     memcpy(dst_ptr, src0_ptr, type_size);
 
-                    if (++i10 == ne0) {
-                        i10 = 0;
+                    if (++k10 == nk0) {
+                        k10 = 0;
                         if (++i11 == ne1) {
                             i11 = 0;
                             if (++i12 == ne2) {
@@ -4182,9 +4184,9 @@ static void ggml_compute_forward_dup_bytes(
                     }
                 }
             }
-            i10 += ne00 * (ne01 - ir1);
-            while (i10 >= ne0) {
-                i10 -= ne0;
+            k10 += nk00 * (ne01 - ir1);
+            while (k10 >= nk0) {
+                k10 -= nk0;
                 if (++i11 == ne1) {
                     i11 = 0;
                     if (++i12 == ne2) {
@@ -14308,7 +14310,9 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     }
 
     // extra_buffer op?
-    if (ggml_cpu_extra_compute_forward(params, tensor)) return;
+    if (ggml_cpu_extra_compute_forward(params, tensor)) {
+        return;
+    }
 
     switch (tensor->op) {
         case GGML_OP_DUP:
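The substance of this change is that `dup` now splits work by quantization blocks instead of individual elements, so a thread boundary can never land in the middle of a block (for example, a Q4_0 block packs 32 elements into one unit of `nb0` bytes). A small self-contained sketch of the same work-splitting arithmetic (illustrative only, not the ggml code itself):

```cpp
#include <algorithm>
#include <cstdio>

// Split ne elements, stored as blocks of blck_size elements each, across nth
// threads so that every thread copies whole blocks only.
static void thread_block_range(int ne, int blck_size, int ith, int nth, int & k0, int & k1) {
    const int nk = ne / blck_size;        // total number of blocks
    const int dr = (nk + nth - 1) / nth;  // blocks per thread, rounded up
    k0 = dr * ith;
    k1 = std::min(k0 + dr, nk);
}

int main() {
    // 4096 elements of a hypothetical type with 32 elements per block, split over 3 threads
    for (int ith = 0; ith < 3; ++ith) {
        int k0, k1;
        thread_block_range(4096, 32, ith, 3, k0, k1);
        printf("thread %d copies blocks [%d, %d)\n", ith, k0, k1);
    }
    return 0;
}
```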

ggml/src/ggml-cuda/common.cuh

Lines changed: 31 additions & 19 deletions
@@ -41,14 +41,17 @@
 #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
 #define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
 
-#define GGML_CUDA_CC_PASCAL 600
-#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define GGML_CUDA_CC_VOLTA 700
-#define GGML_CUDA_CC_TURING 750
-#define GGML_CUDA_CC_AMPERE 800
-#define GGML_CUDA_CC_ADA_LOVELACE 890
-#define GGML_CUDA_CC_OFFSET_AMD 0x1000000
-
+#define GGML_CUDA_CC_PASCAL          600
+#define GGML_CUDA_CC_DP4A            610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define GGML_CUDA_CC_VOLTA           700
+#define GGML_CUDA_CC_TURING          750
+#define GGML_CUDA_CC_AMPERE          800
+#define GGML_CUDA_CC_ADA_LOVELACE    890
+#define GGML_CUDA_CC_OFFSET_AMD      0x1000000
+#define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
+#define GGML_CUDA_CC_IS_NVIDIA(cc)   (cc < GGML_CUDA_CC_OFFSET_MTHREADS)
+
+// AMD
 // GCN/CNDA, wave size is 64
 #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
 #define GGML_CUDA_CC_VEGA (GGML_CUDA_CC_OFFSET_AMD + 0x900) // Vega56/64, minimum for fp16 dual issue
@@ -70,8 +73,17 @@
 #define GGML_CUDA_CC_IS_GCN(cc) (cc > GGML_CUDA_CC_OFFSET_AMD && cc < GGML_CUDA_CC_CDNA)
 #define GGML_CUDA_CC_IS_CDNA(cc) (cc >= GGML_CUDA_CC_CDNA && cc < GGML_CUDA_CC_RDNA1)
 
-#define GGML_CUDA_CC_QY1 210
-#define GGML_CUDA_CC_QY2 220
+// Moore Threads
+#define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210)
+
+#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000
+#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000
+#define GGML_CUDA_CC_NG  (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD
+
+#define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD)
+#define GGML_CUDA_CC_IS_QY1(cc)      (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2)
+#define GGML_CUDA_CC_IS_QY2(cc)      (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT)
+#define GGML_CUDA_CC_IS_NG(cc)       (cc >= GGML_CUDA_CC_NG)
 
 #ifdef __CUDA_ARCH_LIST__
 constexpr bool ggml_cuda_has_arch_impl(int) {
@@ -209,42 +221,42 @@ typedef float2 dfloat2;
 #define CP_ASYNC_AVAILABLE
 #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
 
-#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#if !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
 #define FLASH_ATTN_AVAILABLE
-#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
+#endif // !defined(GGML_CUDA_NO_FA) && !(defined(GGML_USE_MUSA) && GGML_CUDA_MUSA_ARCH_IS_QY1)
 
 static bool fp16_available(const int cc) {
     return ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_PASCAL;
 }
 
 static bool fast_fp16_available(const int cc) {
-    return fp16_available(cc) && cc != 610;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && fp16_available(cc) && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
 }
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fast_fp16_hardware_available(const int cc) {
-    return cc >= GGML_CUDA_CC_PASCAL && cc != 610;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_PASCAL && cc != 610) || GGML_CUDA_CC_IS_AMD(cc);
 }
 
 // Any FP16 tensor core instructions are available for ggml code.
 static bool fp16_mma_available(const int cc) {
 #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
     return false;
 #else
-    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ||
-        GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ||
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
 #endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(GGML_HIP_ROCWMMA_FATTN)
 }
 
 // To be used for feature selection of external libraries, e.g. cuBLAS.
 static bool fp16_mma_hardware_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && cc >= GGML_CUDA_CC_VOLTA ||
-        GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_VOLTA ||
+        GGML_CUDA_CC_IS_CDNA(cc) || GGML_CUDA_CC_IS_RDNA3(cc);
 }
 
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
 static bool new_mma_available(const int cc) {
-    return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
+    return GGML_CUDA_CC_IS_NVIDIA(cc) && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
 }
 
 static bool cp_async_available(const int cc) {
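The net effect is that the single `cc` integer now encodes the vendor as well: NVIDIA values stay below `GGML_CUDA_CC_OFFSET_MTHREADS` (0x0100000), Moore Threads (MUSA) values live between that offset and `GGML_CUDA_CC_OFFSET_AMD` (0x1000000), and AMD values sit above it, which is what the `GGML_CUDA_CC_IS_*` predicates test. A worked example of the encoding that `ggml_cuda_init()` applies for MUSA devices in the `ggml-cuda.cu` change below (the device major/minor values here are illustrative):

```cpp
#include <cstdio>

// Mirrors the vendor-offset encoding: cc = offset + major*0x100 + minor*0x10.
constexpr int OFFSET_MTHREADS = 0x0100000;
constexpr int OFFSET_AMD      = 0x1000000;

constexpr int musa_cc(int major, int minor) {
    return OFFSET_MTHREADS + major * 0x100 + minor * 0x10;
}

constexpr bool is_nvidia(int cc)   { return cc < OFFSET_MTHREADS; }
constexpr bool is_mthreads(int cc) { return cc >= OFFSET_MTHREADS && cc < OFFSET_AMD; }

int main() {
    const int cc = musa_cc(2, 1); // e.g. a QY1-class device such as the MTT S80 (major 2, minor 1)
    printf("cc = 0x%07X, nvidia = %d, mthreads = %d\n", (unsigned) cc, is_nvidia(cc), is_mthreads(cc));
    // prints: cc = 0x0100210, nvidia = 0, mthreads = 1
    return 0;
}
```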

ggml/src/ggml-cuda/fattn.cu

Lines changed: 1 addition & 1 deletion
@@ -253,7 +253,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     const int warp_size = ggml_cuda_info().devices[ggml_cuda_get_device()].warp_size;
     const enum ggml_prec prec = ggml_flash_attn_ext_get_prec(KQV);
 
-    if (cc >= GGML_CUDA_CC_OFFSET_AMD) {
+    if (GGML_CUDA_CC_IS_AMD(cc)) {
 #if defined(GGML_HIP_ROCWMMA_FATTN)
         if (fp16_mma_available(cc)) {
             ggml_cuda_flash_attn_ext_wmma_f16(ctx, dst);

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 5 additions & 5 deletions
@@ -264,9 +264,9 @@ static ggml_cuda_device_info ggml_cuda_init() {
 #elif defined(GGML_USE_MUSA)
         // FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
         info.devices[id].warp_size = 32;
-        // TODO: refine the .cc to reflect MUSA's actual CC capabilities
         info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
-        info.devices[id].cc = 100*prop.major + 10*prop.minor;
+        info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
+        info.devices[id].cc += prop.minor * 0x10;
         GGML_LOG_INFO("  Device %d: %s, compute capability %d.%d, VMM: %s\n",
                       id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
 #else
@@ -1188,11 +1188,11 @@ static void ggml_cuda_op_mul_mat_cublas(
     // ldc == nrows of the matrix that cuBLAS writes into
     int64_t ldc = id == ctx.device ? ne0 : row_diff;
 
-    const int compute_capability = ggml_cuda_info().devices[id].cc;
+    const int cc = ggml_cuda_info().devices[id].cc;
 
     const bool use_fp16 = (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT;
 
-    if (compute_capability >= GGML_CUDA_CC_VOLTA && use_fp16) {
+    if (((cc >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc)) || GGML_CUDA_CC_IS_AMD(cc)) && use_fp16) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         ggml_cuda_pool_alloc<half> src0_as_f16(ctx.pool(id));
         if (src0->type != GGML_TYPE_F16) {
@@ -1216,7 +1216,7 @@ static void ggml_cuda_op_mul_mat_cublas(
 
         CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream));
 
-        if (GGML_CUDA_CC_IS_CDNA(compute_capability)) {
+        if (GGML_CUDA_CC_IS_CDNA(cc)) {
             const float alpha = 1.0f;
             const float beta = 0.0f;
             CUBLAS_CHECK(

ggml/src/ggml-cuda/mmq.cu

Lines changed: 2 additions & 2 deletions
@@ -28,7 +28,7 @@ void ggml_cuda_op_mul_mat_q(
     // Also its fixup needs to allocate a temporary buffer in the memory pool.
     // There are multiple parallel CUDA streams for src1_ncols != ne11 which would introduce a race condition for this buffer.
     const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA &&
-        cc < GGML_CUDA_CC_OFFSET_AMD && src1_ncols == ne11;
+        GGML_CUDA_CC_IS_NVIDIA(cc) && src1_ncols == ne11;
     const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst, use_stream_k};
 
     switch (src0->type) {
@@ -145,7 +145,7 @@ bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
         return true;
 #endif //GGML_CUDA_FORCE_MMQ
 
-    if (cc < GGML_CUDA_CC_OFFSET_AMD) {
+    if (GGML_CUDA_CC_IS_NVIDIA(cc)) {
         return !fp16_mma_hardware_available(cc) || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
     }

ggml/src/ggml-cuda/mmq.cuh

Lines changed: 6 additions & 6 deletions
@@ -90,7 +90,7 @@ struct tile_x_sizes {
 
 static int get_mmq_x_max_host(const int cc) {
     return new_mma_available(cc) ? 128 :
-        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD ?
+        ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc) ?
 #ifdef GGML_CUDA_FORCE_MMQ
             128 : 64;
 #else
@@ -123,8 +123,8 @@ static constexpr __device__ int get_mmq_x_max_device() {
 }
 
 static int get_mmq_y_host(const int cc) {
-    return cc >= GGML_CUDA_CC_OFFSET_AMD ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
-        (ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA ? 128 : 64);
+    return GGML_CUDA_CC_IS_AMD(cc) ? (GGML_CUDA_CC_IS_RDNA1(cc) ? 64 : 128) :
+        ((ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc)) ? 128 : 64);
 }
 
 static constexpr __device__ int get_mmq_y_device() {
@@ -2772,14 +2772,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 
     const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc);
 
-#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
     static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
     if (!shmem_limit_raised[id]) {
         CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
         CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
         shmem_limit_raised[id] = true;
     }
-#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
+#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && !defined(GGML_USE_MUSA)
 
     const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
     const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
@@ -2832,7 +2832,7 @@ void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cuda
     const int mmq_x_max = get_mmq_x_max_host(cc);
    const int mmq_y = get_mmq_y_host(cc);
     const int block_num_y = (args.ne01 + mmq_y - 1) / mmq_y;
-    const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && cc < GGML_CUDA_CC_OFFSET_AMD;
+    const bool use_stream_k = ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_VOLTA && GGML_CUDA_CC_IS_NVIDIA(cc);
 
     int mmq_x_best = 0;
     int nparts_best = INT_MAX;
