 #include "ggml-cuda/ssm-scan.cuh"
 #include "ggml-cuda/sum.cuh"
 #include "ggml-cuda/sumrows.cuh"
+#include "ggml-cuda/mean.cuh"
 #include "ggml-cuda/tsembd.cuh"
 #include "ggml-cuda/unary.cuh"
 #include "ggml-cuda/upscale.cuh"
@@ -99,8 +100,7 @@ int ggml_cuda_get_device() {
 static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
     ggml_cuda_set_device(device);
     cudaError_t err;
-    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
-    {
+    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr) {
         err = cudaMallocManaged(ptr, size);
 #if defined(GGML_USE_HIP)
         if (err == hipSuccess) {
@@ -118,9 +118,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
             err = cudaMalloc(ptr, size);
         }
 #endif // defined(GGML_USE_HIP)
-    }
-    else
-    {
+    } else {
         err = cudaMalloc(ptr, size);
     }
     return err;
@@ -1945,16 +1943,14 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         && ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) && src0->view_src;

     bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
-        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
-        && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
+        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
     bool use_mul_mat_q = ggml_is_quantized(src0->type) && !bad_padding_clear
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

-    bool any_gpus_with_slow_fp16   = false;
-    bool any_gpus_without_fp16_mma = false;
+    bool any_gpus_with_slow_fp16 = false;

     if (split) {
         ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
@@ -1965,16 +1961,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
                 continue;
             }

-            const int cc              = ggml_cuda_info().devices[id].cc;
-            use_mul_mat_q             = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
-            any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
+            const int cc            = ggml_cuda_info().devices[id].cc;
+            use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+            use_mul_mat_vec         = use_mul_mat_vec         && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
+            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
         }
     } else {
-        const int cc              = ggml_cuda_info().devices[ctx.device].cc;
-        use_mul_mat_q             = use_mul_mat_q             && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        any_gpus_with_slow_fp16   = any_gpus_with_slow_fp16   || !fast_fp16_hardware_available(cc);
-        any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_hardware_available(cc);
+        const int cc            = ggml_cuda_info().devices[ctx.device].cc;
+        use_mul_mat_q           = use_mul_mat_q           && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
+        use_mul_mat_vec         = use_mul_mat_vec         && ggml_cuda_should_use_mmv(src0->type, cc, src0->ne, src1->ne[1]);
+        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
     }

     // debug helpers
@@ -1985,7 +1981,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     // printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     // printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

-    if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    if (!split && use_mul_mat_vec) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
         ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
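
Note on the hunk above: the shape checks that used to sit inline in ggml_cuda_mul_mat (src0->ne[0] % 2 == 0, src1->ne[1] == 1, the MMV_MAX_ROWS cutoff and the any_gpus_without_fp16_mma flag) are now consolidated behind ggml_cuda_should_use_mmv, evaluated once per device. Its implementation is not shown in this diff; the sketch below is only a hypothetical predicate of that shape, reconstructed from the conditions that were previously inline, with placeholder constants.

// Hypothetical sketch only -- not the real ggml_cuda_should_use_mmv().
// It mirrors the conditions removed from ggml_cuda_mul_mat in this diff;
// the 512-row cutoff and the cc < 700 tensor-core check are placeholders.
#include <cstdint>

static bool example_should_use_mmv(bool src0_is_f32_f16_or_bf16, int cc, const int64_t * ne /* src0->ne */, int64_t ne11) {
    if (!src0_is_f32_f16_or_bf16) {
        return false;    // the custom kernel only handles F32/F16/BF16 sources
    }
    if (ne11 != 1) {
        return false;    // mat-vec path: src1 must be a single column
    }
    if (ne[0] % 2 != 0) {
        return false;    // previously checked inline before taking the MMV path
    }
    if (cc < 700) {
        return true;     // placeholder: without fp16 tensor cores the custom kernel tends to win
    }
    return ne[1] <= 512; // placeholder standing in for the old MMV_MAX_ROWS cutoff
}
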
@@ -2357,6 +2353,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_SUM_ROWS:
             ggml_cuda_op_sum_rows(ctx, dst);
             break;
+        case GGML_OP_MEAN:
+            ggml_cuda_op_mean(ctx, dst);
+            break;
         case GGML_OP_SSM_CONV:
             ggml_cuda_op_ssm_conv(ctx, dst);
             break;
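
The new GGML_OP_MEAN case dispatches to ggml_cuda_op_mean from the added ggml-cuda/mean.cuh header; the kernel itself is not part of this diff. For reference, a row-wise mean over an F32 tensor (dst[row] = sum of the row divided by its length) can be computed with a one-block-per-row reduction like the hypothetical kernel below; the name and launch configuration are illustrative, not the actual mean.cu code.

// Hypothetical row-wise mean kernel, not the real ggml-cuda/mean.cu.
// Launch with one block per row and exactly 256 threads per block.
#include <cuda_runtime.h>
#include <cstdint>

__global__ void mean_rows_f32_example(const float * x, float * dst, const int64_t ncols) {
    const int64_t row   = blockIdx.x;
    const float * row_x = x + row*ncols;

    // each thread accumulates a strided partial sum over the row
    float sum = 0.0f;
    for (int64_t col = threadIdx.x; col < ncols; col += blockDim.x) {
        sum += row_x[col];
    }

    // block-wide tree reduction in shared memory (assumes blockDim.x == 256, a power of two)
    __shared__ float buf[256];
    buf[threadIdx.x] = sum;
    __syncthreads();
    for (int offset = blockDim.x/2; offset > 0; offset >>= 1) {
        if (threadIdx.x < offset) {
            buf[threadIdx.x] += buf[threadIdx.x + offset];
        }
        __syncthreads();
    }

    // thread 0 writes the average for this row
    if (threadIdx.x == 0) {
        dst[row] = buf[0] / (float) ncols;
    }
}
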
@@ -3260,6 +3259,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:
+        case GGML_OP_MEAN:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
             return true;