@@ -1721,15 +1721,15 @@ static __global__ void k_compute_batched_ptrs(
         size_t nb12, size_t nb13,
         size_t nbd2, size_t nbd3,
         int64_t r2, int64_t r3) {
-    const int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    const int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
+    int64_t i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int64_t i12 = blockIdx.y * blockDim.y + threadIdx.y;
 
     if (i13 >= ne13 || i12 >= ne12) {
         return;
     }
 
-    const int64_t i03 = i13 / r3;
-    const int64_t i02 = i12 / r2;
+    int64_t i03 = i13 / r3;
+    int64_t i02 = i12 / r2;
 
     ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03;
     ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12 + i13*nb13;
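Each thread of k_compute_batched_ptrs handles one (i12, i13) batch of dst and maps it back to a src0 batch through the broadcast ratios r2 and r3 (in ggml these are ne12/ne02 and ne13/ne03), so several src1/dst batches can share one src0 batch, e.g. in grouped-query attention. The CPU sketch below mirrors the pointer tables the kernel builds; the function and parameter names here are illustrative, not part of the ggml API.

// Illustrative CPU reference for the pointer tables that k_compute_batched_ptrs
// fills on the GPU. Strides are byte strides, as in the kernel above.
#include <cstddef>
#include <cstdint>
#include <vector>

static void compute_batched_ptrs_ref(
        const char * src0_base, const char * src1_base, char * dst_base,
        std::vector<const void *> & ptrs_src, std::vector<void *> & ptrs_dst,
        int64_t ne12, int64_t ne13, int64_t ne23,
        size_t nb02, size_t nb03, size_t nb12, size_t nb13,
        size_t nbd2, size_t nbd3, int64_t r2, int64_t r3) {
    ptrs_src.assign(2*ne23, nullptr);
    ptrs_dst.assign(1*ne23, nullptr);
    for (int64_t i13 = 0; i13 < ne13; ++i13) {
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            // broadcast: several (i12, i13) batches may reuse the same src0 batch
            const int64_t i03 = i13 / r3;
            const int64_t i02 = i12 / r2;
            ptrs_src[0*ne23 + i12 + i13*ne12] = src0_base + i02*nb02 + i03*nb03;
            ptrs_src[1*ne23 + i12 + i13*ne12] = src1_base + i12*nb12 + i13*nb13;
            ptrs_dst[0*ne23 + i12 + i13*ne12] = dst_base  + i12*nbd2 + i13*nbd3;
        }
    }
}

ptrs_src stores the ne23 per-batch src0 pointers first and the ne23 src1 pointers after them, which is why the cublasGemmBatchedEx call further down indexes it with + 0*ne23 and + 1*ne23.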
@@ -1743,10 +1743,6 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
 
-    // Byte offsets and tensor dimensions are currently used in an inconsistent way for dst.
-    // As long as dst is contiguous this does not matter though.
-    GGML_ASSERT(ggml_is_contiguous(dst));
-
     GGML_TENSOR_BINARY_OP_LOCALS
 
     const int64_t ne_dst = ggml_nelements(dst);
@@ -1755,31 +1751,21 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 
     CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream));
 
-    const half * src0_f16 = (const half *) src0->data;
-    float * dst_ddf = (float *) dst->data;
-
-    const half * src1_f16 = (const half *) src1->data;
-    const size_t ts_src1 = ggml_type_size(src1->type);
-    GGML_ASSERT(nb10 == ts_src1);
-    int64_t s11 = nb11 / ts_src1;
-    int64_t s12 = nb12 / ts_src1;
-    int64_t s13 = nb13 / ts_src1;
-    ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
+    void * src0_ddq = src0->data;
+    half * src0_f16 = (half *) src0_ddq;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf = (float *) dst->data;
 
     // convert src1 to fp16
+    ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool());
     if (src1->type != GGML_TYPE_F16) {
-        const to_fp16_nc_cuda_t to_fp16_cuda = ggml_get_to_fp16_nc_cuda(src1->type);
+        const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
         const int64_t ne_src1 = ggml_nelements(src1);
         src1_f16_alloc.alloc(ne_src1);
         GGML_ASSERT(to_fp16_cuda != nullptr);
-
-        to_fp16_cuda(src1_f16, src1_f16_alloc.get(), ne10, ne11, ne12, ne13, s11, s12, s13, main_stream);
-
-        src1_f16 = src1_f16_alloc.get();
-        s11 = ne10;
-        s12 = ne11*s11;
-        s13 = ne12*s12;
+        to_fp16_cuda(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream);
     }
+    half * src1_f16 = src1->type == GGML_TYPE_F16 ? (half *) src1_ddf : src1_f16_alloc.get();
 
     ggml_cuda_pool_alloc<half> dst_f16(ctx.pool());
     char * dst_t;
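The removed lines express src1 in element strides: s11/s12/s13 are the byte strides nb11/nb12/nb13 divided by the element size, and once a non-F16 src1 has been converted into a packed fp16 buffer by to_fp16_nc_cuda they collapse to the strides of a contiguous tensor. A small sketch of that bookkeeping; the helper name and the boolean flag standing in for the conversion branch are illustrative.

// Sketch of the byte-stride -> element-stride conversion used on the removed side.
// ts is the element size of src1, nb1x are ggml byte strides.
#include <cstddef>
#include <cstdint>

struct src1_strides {
    int64_t s11, s12, s13; // strides of the (possibly converted) src1, in elements
};

static src1_strides make_src1_strides(size_t ts, size_t nb11, size_t nb12, size_t nb13,
                                      int64_t ne10, int64_t ne11, int64_t ne12,
                                      bool converted_to_packed_f16) {
    if (!converted_to_packed_f16) {
        // view the original (possibly non-contiguous) src1 in units of elements
        return { (int64_t) (nb11/ts), (int64_t) (nb12/ts), (int64_t) (nb13/ts) };
    }
    // the fp16 copy is packed, so the strides are products of the logical dimensions
    const int64_t s11 = ne10;
    const int64_t s12 = ne11*s11;
    const int64_t s13 = ne12*s12;
    return { s11, s12, s13 };
}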
@@ -1839,13 +1825,13 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                 int i02 = i12 / r2;
 
                 CUBLAS_CHECK(
-                cublasGemmEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
-                    ne01, ne11, ne10,
-                    alpha, (const char *) src0_f16 + i03*nb03 + i02*nb02, CUDA_R_16F, nb01/sizeof(half),
-                           src1_f16 + i13*s13 + i12*s12, CUDA_R_16F, s11,
-                    beta,  (      char *) dst_t + i13*nbd3 + i12*nbd2, cu_data_type, ne0,
-                    cu_compute_type,
-                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+                cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+                    ne01, ne11, ne10,
+                    alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3], CUDA_R_16F, nb01/sizeof(half),
+                           (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+                    beta,  (      char *) dst_t + i12*nbd2 + i13*nbd3, cu_data_type, ne01,
+                    cu_compute_type,
+                    CUBLAS_GEMM_DEFAULT_TENSOR_OP));
             }
         }
     }
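Each cublasGemmEx call in this loop computes one ne01 x ne11 slice of dst: ggml stores rows of length ne10 contiguously, which a column-major BLAS sees as a ne10 x ne01 matrix, hence CUBLAS_OP_T for src0 and CUBLAS_OP_N for src1 with m = ne01, n = ne11, k = ne10. Below is a CPU reference for the same slice; packed fp32 inputs are assumed for simplicity, whereas the loop above works on fp16 with ggml strides.

// dst is ne01 x ne11; dst(i01, i11) = dot(row i01 of src0, row i11 of src1)
#include <cstdint>
#include <vector>

static void mul_mat_slice_ref(const std::vector<float> & src0, // ne01 rows of length ne10
                              const std::vector<float> & src1, // ne11 rows of length ne10
                              std::vector<float> & dst,
                              int64_t ne01, int64_t ne11, int64_t ne10) {
    dst.assign(ne01*ne11, 0.0f);
    for (int64_t i11 = 0; i11 < ne11; ++i11) {
        for (int64_t i01 = 0; i01 < ne01; ++i01) {
            float sum = 0.0f;
            for (int64_t i10 = 0; i10 < ne10; ++i10) {
                sum += src0[i01*ne10 + i10]*src1[i11*ne10 + i10];
            }
            // column-major ne01 x ne11 result, matching ldc = ne01 (ne0) in the calls above
            dst[i11*ne01 + i01] = sum;
        }
    }
}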
@@ -1856,15 +1842,15 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         CUBLAS_CHECK(
         cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
-                alpha, src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
-                       src1_f16, CUDA_R_16F, s11,       s12,       // strideB
-                beta,  dst_t,    cu_data_type, ne0,     ne1*ne0,   // strideC
+                alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA
+                       (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB
+                beta,  (      char *) dst_t,    cu_data_type, ne01,    nb2/nb0,   // strideC
                 ne12*ne13,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
     } else {
         // use cublasGemmBatchedEx
-        const int64_t ne23 = ne12*ne13;
+        const int ne23 = ne12*ne13;
 
         ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23);
         ggml_cuda_pool_alloc<      void *> ptrs_dst(ctx.pool(), 1*ne23);
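When consecutive batches of src0, src1 and dst are equidistant in memory, a single cublasGemmStridedBatchedEx call covers all ne12*ne13 matrices; the leading dimensions and batch strides are passed in elements (nb01/nb00, nb02/nb00, ...). The following self-contained sketch shows such a call with fp16 inputs, fp32 output and fp32 accumulation; the shapes, fill values and compute type are assumptions for illustration, not taken from ggml.

// Strided-batched GEMM sketch: batch of fp16 A/B matrices, fp32 output.
#include <cublas_v2.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <cstdio>
#include <vector>

int main() {
    const int m = 64, n = 32, k = 128, batch = 8;   // ne01, ne11, ne10, ne12*ne13

    std::vector<__half> h_A((size_t) batch*m*k);
    std::vector<__half> h_B((size_t) batch*n*k);
    for (size_t i = 0; i < h_A.size(); ++i) h_A[i] = __float2half(0.01f);
    for (size_t i = 0; i < h_B.size(); ++i) h_B[i] = __float2half(0.02f);

    __half * d_A; __half * d_B; float * d_C;
    cudaMalloc((void **) &d_A, h_A.size()*sizeof(__half));
    cudaMalloc((void **) &d_B, h_B.size()*sizeof(__half));
    cudaMalloc((void **) &d_C, (size_t) batch*m*n*sizeof(float));
    cudaMemcpy(d_A, h_A.data(), h_A.size()*sizeof(__half), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B.data(), h_B.size()*sizeof(__half), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    const float alpha = 1.0f, beta = 0.0f;
    // A is stored row-major with rows of length k -> a column-major BLAS sees a
    // k x m matrix with lda = k, hence CUBLAS_OP_T (this mirrors lda = nb01/nb00 above)
    cublasStatus_t st = cublasGemmStridedBatchedEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
            m, n, k,
            &alpha, d_A, CUDA_R_16F, k, (long long) m*k,   // strideA in elements
                    d_B, CUDA_R_16F, k, (long long) n*k,   // strideB in elements
            &beta,  d_C, CUDA_R_32F, m, (long long) m*n,   // strideC in elements
            batch,
            CUBLAS_COMPUTE_32F,
            CUBLAS_GEMM_DEFAULT_TENSOR_OP);
    printf("cublasGemmStridedBatchedEx status: %d\n", (int) st);

    cublasDestroy(handle);
    cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);
    return 0;
}

Build with nvcc and link against cuBLAS, e.g. nvcc -o gemm_demo gemm_demo.cu -lcublas.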
@@ -1876,8 +1862,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
                 ne12, ne13,
                 ne23,
                 nb02, nb03,
-                src1->type == GGML_TYPE_F16 ? nb12 : s12*sizeof(half),
-                src1->type == GGML_TYPE_F16 ? nb13 : s13*sizeof(half),
+                src1->type == GGML_TYPE_F16 ? nb12 : nb12/2,
+                src1->type == GGML_TYPE_F16 ? nb13 : nb13/2,
                 nbd2, nbd3,
                 r2, r3);
         CUDA_CHECK(cudaGetLastError());
@@ -1886,8 +1872,8 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
         cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N,
                 ne01, ne11, ne10,
                 alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00,
-                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, s11,
-                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne0,
+                       (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10,
+                beta,  (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01,
                 ne23,
                 cu_compute_type,
                 CUBLAS_GEMM_DEFAULT_TENSOR_OP));
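cublasGemmBatchedEx takes device-resident arrays of ne23 per-batch A/B/C pointers instead of fixed strides, which is what makes it usable when src0 is broadcast (r2 or r3 > 1) and the batches are therefore not equidistant. ggml fills those arrays on the GPU with k_compute_batched_ptrs, presumably to avoid staging the table on the host; a host-side alternative would look like the sketch below (illustrative helper, placeholder names).

// Hedged sketch: building a per-batch pointer array on the host and copying it
// to the device, as an alternative to the k_compute_batched_ptrs kernel.
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdint>
#include <vector>

static const void ** upload_batch_ptrs(const char * base, size_t nb2, size_t nb3,
                                       int64_t ne2, int64_t ne3) {
    const int64_t n = ne2*ne3;
    std::vector<const void *> h_ptrs((size_t) n);
    for (int64_t i3 = 0; i3 < ne3; ++i3) {
        for (int64_t i2 = 0; i2 < ne2; ++i2) {
            h_ptrs[i2 + i3*ne2] = base + i2*nb2 + i3*nb3;
        }
    }
    const void ** d_ptrs = nullptr;
    cudaMalloc((void **) &d_ptrs, n*sizeof(void *));
    cudaMemcpy(d_ptrs, h_ptrs.data(), n*sizeof(void *), cudaMemcpyHostToDevice);
    return d_ptrs; // usable as the Aarray/Barray argument of cublasGemmBatchedEx
}

Packing the src0 and src1 pointer sets into one 2*ne23-element allocation, as the ptrs_src buffer above does, lets a single kernel launch (or upload) cover both operands.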
@@ -1951,7 +1937,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else if (!split && use_mul_mat_vec_q) {
         ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
-        !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+        dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
     } else if (use_mul_mat_vec) {
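The added dst->op_params[0] == GGML_PREC_DEFAULT condition gates this dispatch on the matmul precision requested via ggml_mul_mat_set_prec, which stores the ggml_prec value in op_params[0]. A minimal helper showing how that flag can be read; the function name is illustrative and assumes ggml.h is available.

#include "ggml.h"

// true if the default (fp16-capable) precision may be used for this mul_mat,
// i.e. ggml_mul_mat_set_prec(dst, GGML_PREC_F32) was not requested
static bool mul_mat_uses_default_prec(const struct ggml_tensor * dst) {
    return (enum ggml_prec) dst->op_params[0] == GGML_PREC_DEFAULT;
}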