@@ -1411,11 +1411,6 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];
 
-    // const int64_t nb10 = src1->nb[0];
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2];
-    const int64_t nb13 = src1->nb[3];
-
     const int64_t nb2 = dst->nb[2];
     const int64_t nb3 = dst->nb[3];
 
@@ -1551,10 +1546,7 @@ static void ggml_cuda_op_mul_mat(
             dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);
 
             if (src1_on_device && src1_is_contiguous) {
-                quantize_src1(
-                    dev[id].src1_ddf, dev[id].src1_ddq, src0->type, ne10,
-                    nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float),
-                    src1_padded_col_size, ne11, ne12, ne13, stream);
+                quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
         }
@@ -1649,9 +1641,7 @@ static void ggml_cuda_op_mul_mat(
             }
 
             if (quantize_src1 && !src1_is_contiguous) {
-                quantize_src1(
-                    src1_ddf_i, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10,
-                    src1_padded_col_size, src1_ncols, 1, 1, stream);
+                quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
 
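Note on the two quantize_src1 call sites above: when src1 is contiguous, its batch dimensions can be flattened, so passing ne11 and ne12*ne13 as row counts addresses the same memory as the stride-based variant. A minimal sketch of that equivalence; row_offset_strided and row_offset_flattened are illustrative helpers, not ggml functions:

#include <cstddef>
#include <cstdint>

// For a contiguous F32 src1 of shape (ne10, ne11, ne12, ne13), the byte strides are
// nb11 = ne10*sizeof(float), nb12 = ne11*nb11, nb13 = ne12*nb12.
static size_t row_offset_strided(int64_t i11, int64_t i12, int64_t i13,
                                 size_t nb11, size_t nb12, size_t nb13) {
    return i11*nb11 + i12*nb12 + i13*nb13;
}

// Treating the tensor as ne12*ne13 "channels" of ne11 rows of ne10 floats gives the
// same byte offset, which is why the flattened call only needs ne10, ne11 and ne12*ne13.
static size_t row_offset_flattened(int64_t i11, int64_t i12, int64_t i13,
                                   int64_t ne10, int64_t ne11, int64_t ne12) {
    const int64_t row = (i13*ne12 + i12)*ne11 + i11; // flattened row index
    return (size_t) row * ne10 * sizeof(float);      // equals the strided offset when contiguous
}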
@@ -1889,7 +1879,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
 
-    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
+    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
@@ -1930,12 +1920,10 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     // printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     // printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
-        ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
-    } else if (!split && use_mul_mat_vec_q) {
-        ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
         && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
@@ -2012,15 +2000,6 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
-    if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ne2 == 1) {
-        if (ggml_is_quantized(src0->type)) {
-            ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
-        } else {
-            ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
-        }
-        return;
-    }
-
     GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
 
     cudaStream_t stream = ctx.stream();
@@ -2057,75 +2036,97 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
     dst_row.nb[2] = nb1;
     dst_row.nb[3] = nb1;
 
-    ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
-    ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
-
-    src1_row.data = src1_contiguous.get();
-    dst_row.data = dst_contiguous.get();
-
-    for (int64_t i02 = 0; i02 < n_as; i02++) {
-        int64_t num_src1_rows = 0;
-
+    if (ne12 == 1) {
         for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
             for (int64_t id = 0; id < n_ids; id++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+                const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
 
-                GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+                GGML_ASSERT(i02 >= 0 && i02 < n_as);
 
-                if (row_id_i != i02) {
-                    continue;
-                }
+                const int64_t i11 = id % ne11;
+                const int64_t i12 = iid1;
+
+                const int64_t i1 = id;
+                const int64_t i2 = i12;
 
-                num_src1_rows++;
+                src0_row.data = src0_original + i02*nb02;
+                src1_row.data = src1_original + i11*nb11 + i12*nb12;
+                dst_row.data = dst_original + i1*nb1 + i2*nb2;
+
+                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
             }
         }
+    } else {
+        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
+        ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
 
-        if (num_src1_rows == 0) {
-            continue;
-        }
+        src1_row.data = src1_contiguous.get();
+        dst_row.data = dst_contiguous.get();
 
-        ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
-        ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
-        CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
+        for (int64_t i02 = 0; i02 < n_as; i02++) {
+            int64_t num_src1_rows = 0;
 
-        {
-            dim3 block_dims(std::min((unsigned int)ne10, 768u));
-            dim3 grid_dims(ids->ne[1], n_ids);
-            k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                src1_original, src1_contiguous.get(),
-                dev_cur_src1_row.get(), dev_row_mapping.get(),
-                ids_dev, i02, ids->nb[1], ids->nb[0],
-                ne11, ne10,
-                nb11, nb12);
-            CUDA_CHECK(cudaGetLastError());
-        }
+            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+                for (int64_t id = 0; id < n_ids; id++) {
+                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
 
-        src0_row.data = src0_original + i02*nb02;
+                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
 
-        GGML_ASSERT(nb11 == sizeof(float)*ne10);
-        GGML_ASSERT(nb1 == sizeof(float)*ne0);
+                    if (row_id_i != i02) {
+                        continue;
+                    }
 
-        src1_row.ne[1] = num_src1_rows;
-        src1_row.nb[1] = nb11;
-        src1_row.nb[2] = num_src1_rows*nb11;
-        src1_row.nb[3] = num_src1_rows*nb11;
+                    num_src1_rows++;
+                }
+            }
 
-        dst_row.ne[1] = num_src1_rows;
-        dst_row.nb[1] = nb1;
-        dst_row.nb[2] = num_src1_rows*nb1;
-        dst_row.nb[3] = num_src1_rows*nb1;
+            if (num_src1_rows == 0) {
+                continue;
+            }
 
-        ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+            ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
+            ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
+            CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
 
-        {
-            dim3 block_dims(std::min((unsigned int)ne0, 768u));
-            dim3 grid_dims(num_src1_rows);
-            k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                dst_original, dst_contiguous.get(),
-                dev_row_mapping.get(),
-                ne0,
-                nb1, nb2);
-            CUDA_CHECK(cudaGetLastError());
+            {
+                dim3 block_dims(std::min((unsigned int)ne10, 768u));
+                dim3 grid_dims(ids->ne[1], n_ids);
+                k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                    src1_original, src1_contiguous.get(),
+                    dev_cur_src1_row.get(), dev_row_mapping.get(),
+                    ids_dev, i02, ids->nb[1], ids->nb[0],
+                    ne11, ne10,
+                    nb11, nb12);
+                CUDA_CHECK(cudaGetLastError());
+            }
+
+            src0_row.data = src0_original + i02*nb02;
+
+            GGML_ASSERT(nb11 == sizeof(float)*ne10);
+            GGML_ASSERT(nb1 == sizeof(float)*ne0);
+
+            src1_row.ne[1] = num_src1_rows;
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.ne[1] = num_src1_rows;
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+
+            {
+                dim3 block_dims(std::min((unsigned int)ne0, 768u));
+                dim3 grid_dims(num_src1_rows);
+                k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                    dst_original, dst_contiguous.get(),
+                    dev_row_mapping.get(),
+                    ne0,
+                    nb1, nb2);
+                CUDA_CHECK(cudaGetLastError());
+            }
         }
     }
 }
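For readers unfamiliar with the else branch above: when ne12 != 1, the src1 rows routed to a given expert i02 are first gathered into a contiguous buffer (recording where each row came from), multiplied in one batched call, and then scattered back into dst. A simplified CUDA sketch of that gather/scatter idea; the names, the flat int ids layout, and the ne11 == 1 assumption are mine, not the actual signatures of k_copy_src1_to_contiguous/k_copy_dst_from_contiguous:

#include <cuda_runtime.h>

struct row_mapping { int iid1; int id; };  // origin of each gathered row

// One block per (token, expert-slot) pair; blocks whose expert id does not match
// exit early, matching blocks claim the next dense row slot atomically.
__global__ void gather_src1_rows(const float * src1, float * src1_dense,
                                 int * cur_row, row_mapping * mapping,
                                 const int * ids, int expert, int n_ids, int ne10) {
    const int iid1 = blockIdx.x;
    const int id   = blockIdx.y;
    if (ids[iid1*n_ids + id] != expert) {
        return;
    }
    __shared__ int slot;
    if (threadIdx.x == 0) {
        slot = atomicAdd(cur_row, 1);
        mapping[slot].iid1 = iid1;
        mapping[slot].id   = id;
    }
    __syncthreads();
    for (int i = threadIdx.x; i < ne10; i += blockDim.x) {
        src1_dense[slot*ne10 + i] = src1[iid1*ne10 + i];  // assumes ne11 == 1 for brevity
    }
}

// One block per gathered row; writes the result back to where the row came from.
__global__ void scatter_dst_rows(float * dst, const float * dst_dense,
                                 const row_mapping * mapping, int n_ids, int ne0) {
    const int slot = blockIdx.x;
    const row_mapping m = mapping[slot];
    for (int i = threadIdx.x; i < ne0; i += blockDim.x) {
        dst[(m.iid1*n_ids + m.id)*ne0 + i] = dst_dense[slot*ne0 + i];
    }
}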
@@ -2493,7 +2494,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }
 
-        if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
+        if (node->op == GGML_OP_MUL_MAT_ID) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
@@ -3207,7 +3208,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         }
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK: {
-            return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
+            const size_t ts = ggml_type_size(op->src[0]->type);
+            const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2];
+            return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts;
         }
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
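The ROPE/ROPE_BACK check above no longer calls ggml_is_contiguous_2; it only requires that elements within a row be packed (nb[0] equals the type size) and that each slice along dimension 3 be dense overall (nb[3] equals ne[0]*ne[1]*ne[2] elements). A standalone sketch of the same predicate, with names local to this example:

#include <cstddef>
#include <cstdint>

// nb[0] == type_size                  -> no padding between elements within a row;
// nb[3] == ne[0]*ne[1]*ne[2]*type_size -> each i3 slice is densely packed,
//                                         even if dimensions 1 and 2 are permuted inside it.
static bool rope_src_layout_ok(const int64_t ne[4], const size_t nb[4], size_t type_size) {
    const int64_t ne012 = ne[0]*ne[1]*ne[2];
    return nb[0] == type_size && nb[3] == (size_t) ne012*type_size;
}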