@@ -293,10 +293,13 @@ static constexpr __device__ int mmq_get_granularity_device(ggml_type type, const
 }
 #elif defined(NEW_MMA_AVAILABLE)
 static constexpr __device__ int mmq_get_granularity_device(ggml_type type, const int mmq_x) {
+    GGML_UNUSED(type);
     return mmq_x >= 48 ? 16 : 8;
 }
 #else
 static constexpr __device__ int mmq_get_granularity_device(ggml_type type, const int mmq_x) {
+    GGML_UNUSED(type);
+    GGML_UNUSED(mmq_x);
     return 8;
 }
 #endif // AMD_MMA_AVAILABLE
@@ -367,6 +370,7 @@ static constexpr __device__ int get_mmq_nwarps_device(ggml_type type) {
 }
 #else
 static constexpr __device__ int get_mmq_nwarps_device(ggml_type type) {
+    GGML_UNUSED(type);
     return 8;
 }
 #endif // AMD_MMA_AVAILABLE
@@ -3564,8 +3568,8 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 
     const int nbytes_shared = mmq_get_nbytes_shared<type>(mmq_x, mmq_y, cc, warp_size, nwarps);
 
-    CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, MMQ_NWARPS, false>), nbytes_shared);
-    CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, MMQ_NWARPS, true>), nbytes_shared);
+    CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, false>), nbytes_shared);
+    CUDA_SET_SHARED_MEMORY_LIMIT((mul_mat_q<type, mmq_x, true>), nbytes_shared);
 
     const int nty = (args.nrows_x + mmq_y - 1) / mmq_y;
     const int ntx = (args.ncols_dst + mmq_x - 1) / mmq_x;
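
Note on the GGML_UNUSED additions: on the fallback paths the function bodies return constants and no longer read one or both parameters, which would trip -Wunused-parameter on warnings-as-errors builds. A minimal, self-contained sketch of the pattern follows; the ggml_type enum stub is hypothetical, and GGML_UNUSED is assumed to be the plain void-cast macro from ggml.h.

    // Stub standing in for the real ggml_type enum (hypothetical values).
    enum ggml_type { GGML_TYPE_Q4_0, GGML_TYPE_Q8_0 };

    // Assumed definition, matching the void-cast macro in ggml.h.
    #define GGML_UNUSED(x) (void)(x)

    // Fallback path: the granularity is a constant, so neither parameter is
    // read; the void casts silence -Wunused-parameter without changing codegen.
    static constexpr __device__ int mmq_get_granularity_device(ggml_type type, const int mmq_x) {
        GGML_UNUSED(type);
        GGML_UNUSED(mmq_x);
        return 8;
    }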
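Note on the launch-side hunk: it tracks a signature change in which mul_mat_q apparently drops MMQ_NWARPS from its template parameter list (the warp count is the runtime nwarps value passed to mmq_get_nbytes_shared above), so each remaining <type, mmq_x, need_check> instantiation still has to be opted in to large dynamic shared memory individually. The sketch below shows what such a macro must boil down to, using the real cudaFuncSetAttribute API but a hypothetical kernel and helper name; the actual CUDA_SET_SHARED_MEMORY_LIMIT in ggml-cuda additionally caches the result per device.

    #include <cuda_runtime.h>

    // Hypothetical stand-in for the templated MMQ kernel.
    template <int mmq_x, bool need_check>
    __global__ void mul_mat_q_sketch(const char * x, const char * y, float * dst) {
        extern __shared__ char data_mul_mat_q[];  // dynamic shared memory, sized at launch
        // ... tile loading and mma work would go here ...
    }

    // Hypothetical helper: kernels requesting more than the default 48 KiB of
    // dynamic shared memory must opt in explicitly, once per instantiation.
    static void raise_smem_limit(const int nbytes_shared) {
        cudaFuncSetAttribute(mul_mat_q_sketch<64, false>,
                             cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared);
        cudaFuncSetAttribute(mul_mat_q_sketch<64, true>,
                             cudaFuncAttributeMaxDynamicSharedMemorySize, nbytes_shared);
    }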