@@ -1629,11 +1629,6 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];

-    // const int64_t nb10 = src1->nb[0];
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2];
-    const int64_t nb13 = src1->nb[3];
-
     const int64_t nb2 = dst->nb[2];
     const int64_t nb3 = dst->nb[3];

@@ -1768,10 +1763,7 @@ static void ggml_cuda_op_mul_mat(
             dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);

             if (src1_on_device && src1_is_contiguous) {
-                quantize_src1(
-                    dev[id].src1_ddf, dev[id].src1_ddq, src0->type, ne10,
-                    nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float),
-                    src1_padded_col_size, ne11, ne12, ne13, stream);
+                quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
         }
@@ -1869,9 +1861,7 @@ static void ggml_cuda_op_mul_mat(
                 }

                 if (quantize_src1 && !src1_is_contiguous) {
-                    quantize_src1(
-                        src1_ddf_i, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10,
-                        src1_padded_col_size, src1_ncols, 1, 1, stream);
+                    quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
                     CUDA_CHECK(cudaGetLastError());
                 }

@@ -2165,7 +2155,7 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;

-    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
+    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;

@@ -2226,21 +2216,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     } else if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
-        ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
-
-    } else if (!split && use_mul_mat_vec_q) {
-        ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);

     } else if (!split && src0->type == GGML_TYPE_F16 && src1->ne[1] == 1 && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
-        ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);

-    // } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
-    //     dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
-    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
-        && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
+    } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
+        dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
         ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
-
     } else if (use_dequantize_mul_mat_vec) {
         ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
     } else if (use_mul_mat_vec) {
@@ -2315,15 +2299,6 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *

     GGML_TENSOR_BINARY_OP_LOCALS

-    if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ne2 == 1) {
-        if (ggml_is_quantized(src0->type)) {
-            ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
-        } else {
-            ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
-        }
-        return;
-    }
-
     GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");

     cudaStream_t stream = ctx.stream();
@@ -2360,75 +2335,97 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
     dst_row.nb[2] = nb1;
     dst_row.nb[3] = nb1;

-    ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
-    ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
-
-    src1_row.data = src1_contiguous.get();
-    dst_row.data = dst_contiguous.get();
-
-    for (int64_t i02 = 0; i02 < n_as; i02++) {
-        int64_t num_src1_rows = 0;
-
+    if (ne12 == 1) {
         for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
             for (int64_t id = 0; id < n_ids; id++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+                const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);

-                GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+                GGML_ASSERT(i02 >= 0 && i02 < n_as);

-                if (row_id_i != i02) {
-                    continue;
-                }
+                const int64_t i11 = id % ne11;
+                const int64_t i12 = iid1;
+
+                const int64_t i1 = id;
+                const int64_t i2 = i12;
+
+                src0_row.data = src0_original + i02*nb02;
+                src1_row.data = src1_original + i11*nb11 + i12*nb12;
+                dst_row.data = dst_original + i1*nb1 + i2*nb2;

-                num_src1_rows++;
+                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
             }
         }
+    } else {
+        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
+        ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));

-        if (num_src1_rows == 0) {
-            continue;
-        }
+        src1_row.data = src1_contiguous.get();
+        dst_row.data = dst_contiguous.get();

-        ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
-        ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
-        CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
+        for (int64_t i02 = 0; i02 < n_as; i02++) {
+            int64_t num_src1_rows = 0;

-        {
-            dim3 block_dims(std::min((unsigned int)ne10, 768u));
-            dim3 grid_dims(ids->ne[1], n_ids);
-            k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                src1_original, src1_contiguous.get(),
-                dev_cur_src1_row.get(), dev_row_mapping.get(),
-                ids_dev, i02, ids->nb[1], ids->nb[0],
-                ne11, ne10,
-                nb11, nb12);
-            CUDA_CHECK(cudaGetLastError());
-        }
+            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+                for (int64_t id = 0; id < n_ids; id++) {
+                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);

-        src0_row.data = src0_original + i02*nb02;
+                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);

-        GGML_ASSERT(nb11 == sizeof(float)*ne10);
-        GGML_ASSERT(nb1 == sizeof(float)*ne0);
+                    if (row_id_i != i02) {
+                        continue;
+                    }

-        src1_row.ne[1] = num_src1_rows;
-        src1_row.nb[1] = nb11;
-        src1_row.nb[2] = num_src1_rows*nb11;
-        src1_row.nb[3] = num_src1_rows*nb11;
+                    num_src1_rows++;
+                }
+            }

-        dst_row.ne[1] = num_src1_rows;
-        dst_row.nb[1] = nb1;
-        dst_row.nb[2] = num_src1_rows*nb1;
-        dst_row.nb[3] = num_src1_rows*nb1;
+            if (num_src1_rows == 0) {
+                continue;
+            }

-        ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+            ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
+            ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
+            CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));

-        {
-            dim3 block_dims(std::min((unsigned int)ne0, 768u));
-            dim3 grid_dims(num_src1_rows);
-            k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                dst_original, dst_contiguous.get(),
-                dev_row_mapping.get(),
-                ne0,
-                nb1, nb2);
-            CUDA_CHECK(cudaGetLastError());
+            {
+                dim3 block_dims(std::min((unsigned int)ne10, 768u));
+                dim3 grid_dims(ids->ne[1], n_ids);
+                k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                    src1_original, src1_contiguous.get(),
+                    dev_cur_src1_row.get(), dev_row_mapping.get(),
+                    ids_dev, i02, ids->nb[1], ids->nb[0],
+                    ne11, ne10,
+                    nb11, nb12);
+                CUDA_CHECK(cudaGetLastError());
+            }
+
+            src0_row.data = src0_original + i02*nb02;
+
+            GGML_ASSERT(nb11 == sizeof(float)*ne10);
+            GGML_ASSERT(nb1 == sizeof(float)*ne0);
+
+            src1_row.ne[1] = num_src1_rows;
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.ne[1] = num_src1_rows;
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+
+            {
+                dim3 block_dims(std::min((unsigned int)ne0, 768u));
+                dim3 grid_dims(num_src1_rows);
+                k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                    dst_original, dst_contiguous.get(),
+                    dev_row_mapping.get(),
+                    ne0,
+                    nb1, nb2);
+                CUDA_CHECK(cudaGetLastError());
+            }
         }
     }
 }
@@ -2838,7 +2835,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }

-        if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
+        if (node->op == GGML_OP_MUL_MAT_ID) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
@@ -3630,7 +3627,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         }
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK: {
-            return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
+            const size_t ts = ggml_type_size(op->src[0]->type);
+            const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2];
+            return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts;
         }
         case GGML_OP_IM2COL:
         case GGML_OP_CONV_2D_DW: