@@ -1720,15 +1720,15 @@ static __global__ void k_compute_batched_ptrs(
         size_t nb12, size_t nb13,
         size_t nbd2, size_t nbd3,
         int64_t r2, int64_t r3) {
-    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
+    const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (i13 >= ne13 || i12 >= ne12) {
         return;
     }
 
-    int64_t i03 = i13 / r3;
-    int64_t i02 = i12 / r2;
+    const int64_t i03 = i13 / r3;
+    const int64_t i02 = i12 / r2;
 
     ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
     ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
@@ -1742,6 +1742,10 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
 
+    // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
+    // As long as dst is contiguous this does not matter though.
+    GGML_ASSERT(ggml_is_contiguous(dst));
+
     GGML_TENSOR_BINARY_OP_LOCALS
 
     const int64_t ne_dst = ggml_nelements(dst);
@@ -1750,21 +1754,31 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 
     CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
 
-    void * src0_ddq = src0->data;
-    half * src0_f16 = (half *) src0_ddq;
-    float * src1_ddf = (float *) src1->data;
-    float * dst_ddf = (float *) dst->data;
+    const half * src0_f16 = (const half *) src0->data;
+    float * dst_ddf = (float *) dst->data;
 
-    // convert src1 to fp16
+    const half * src1_f16 = (const half *) src1->data;
+    const size_t ts_src1 = ggml_type_size(src1->type);
+    GGML_ASSERT(nb10 == ts_src1);
+    int64_t s11 = nb11 / ts_src1;
+    int64_t s12 = nb12 / ts_src1;
+    int64_t s13 = nb13 / ts_src1;
     ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
+
+    // convert src1 to fp16
     if (src1->type != GGML_TYPE_F16) {
-        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+        const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type);
         const int64_t ne_src1 = ggml_nelements(src1);
         src1_f16_alloc.alloc(ne_src1);
         GGML_ASSERT(to_fp16_cuda != nullptr);
-        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
+
+        to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
+
+        src1_f16 = src1_f16_alloc.get();
+        s11 = ne10;
+        s12 = ne11*s11;
+        s13 = ne12*s12;
     }
-    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
 
     ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
     char * dst_t;
@@ -1824,13 +1838,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                 int i02 = i12 / r2;
 
                 CUBLAS_CHECK(
-                cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                    ne01, ne11, ne10,
-                    alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3], CUDA_R_16F, nb01/sizeof(half),
-                           (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
-                    beta,  (      char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
-                    cu_compute_type,
-                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+                cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
+                    ne01, ne11, ne10,
+                    alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half),
+                           src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11,
+                    beta,  (      char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0,
+                    cu_compute_type,
+                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
             }
         }
     }
@@ -1841,15 +1855,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
-                       (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
-                beta,  (      char *) dst_t,  cu_data_type, ne01,      nb2/nb0,   // strideC
+                alpha, src0_f16, CUDA_R_16F,   nb01/nb00, nb02/nb00, // strideA
+                       src1_f16, CUDA_R_16F,   s11,       s12,       // strideB
+                beta,  dst_t,    cu_data_type, ne0,       ne1*ne0,   // strideC
                 ne12*ne13,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
-        const int ne23 = ne12*ne13;
+        const int64_t ne23 = ne12*ne13;
 
         ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
         ggml_cuda_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
@@ -1861,8 +1875,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
-                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
-                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
+                src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half),
+                src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half),
                 nbd2, nbd3,
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
@@ -1871,8 +1885,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
                 alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
-                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
-                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
+                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11,
+                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
                 ne23,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@@ -1936,7 +1950,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else if (!split && use_mul_mat_vec_q) {
         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
-        dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_mul_mat_vec) {
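
Note (editorial, not part of the patch): the core of this change is that src1's byte strides nb10..nb13 are converted to element strides s11/s12/s13, since cuBLAS expects leading dimensions and batch strides in elements of the data type rather than bytes; when src1 first has to be converted to FP16 into a packed temporary buffer, the strides are reset to those of a dense tensor. Below is a minimal standalone C++ sketch of that stride math only. The shape values (128 x 32 x 8 x 2, FP32) are made up for illustration and are not taken from the patch.

// Standalone illustration of the stride handling in the diff above (hypothetical values only).
#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical src1 shape (ne10 x ne11 x ne12 x ne13), FP32, fully contiguous
    const int64_t ne10 = 128, ne11 = 32, ne12 = 8, ne13 = 2;
    const size_t  ts_src1 = sizeof(float);   // plays the role of ggml_type_size(src1->type)
    const size_t  nb10 = ts_src1;            // innermost byte stride
    const size_t  nb11 = nb10*ne10;
    const size_t  nb12 = nb11*ne11;
    const size_t  nb13 = nb12*ne12;

    // element strides, as computed in the patch: s1x = nb1x / ts_src1
    int64_t s11 = nb11 / ts_src1;
    int64_t s12 = nb12 / ts_src1;
    int64_t s13 = nb13 / ts_src1;

    // after converting src1 to FP16 into a packed temporary buffer,
    // the strides collapse to those of a dense tensor:
    s11 = ne10;
    s12 = ne11*s11;
    s13 = ne12*s12;

    // s11 is then passed as ldb and s12 as strideB to cublasGemmStridedBatchedEx,
    // with ne12*ne13 matrices in the batch; the pointer-array fallback uses
    // s12*sizeof(half) and s13*sizeof(half) as byte strides instead.
    printf("ldb = %lld, strideB = %lld, s13 = %lld, batch count = %lld\n",
           (long long) s11, (long long) s12, (long long) s13, (long long) (ne12*ne13));
    return 0;
}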