
Commit 91ca5ab

Revert commit "CUDA: noncont MMVQ + batched bs1 MUL_MAT_ID (ggml-org#13014)"
1 parent 25e747e commit 91ca5ab

File tree

8 files changed: +418 −540 lines changed


ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 84 additions & 81 deletions
@@ -1411,11 +1411,6 @@ static void ggml_cuda_op_mul_mat(
     const int64_t ne0 = dst->ne[0];
     const int64_t ne1 = dst->ne[1];

-    // const int64_t nb10 = src1->nb[0];
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2];
-    const int64_t nb13 = src1->nb[3];
-
     const int64_t nb2 = dst->nb[2];
     const int64_t nb3 = dst->nb[3];

@@ -1551,10 +1546,7 @@ static void ggml_cuda_op_mul_mat(
             dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size);

             if (src1_on_device && src1_is_contiguous) {
-                quantize_src1(
-                    dev[id].src1_ddf, dev[id].src1_ddq, src0->type, ne10,
-                    nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float),
-                    src1_padded_col_size, ne11, ne12, ne13, stream);
+                quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream);
                 CUDA_CHECK(cudaGetLastError());
             }
         }
@@ -1649,9 +1641,7 @@ static void ggml_cuda_op_mul_mat(
             }

            if (quantize_src1 && !src1_is_contiguous) {
-                quantize_src1(
-                    src1_ddf_i, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10,
-                    src1_padded_col_size, src1_ncols, 1, 1, stream);
+                quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream);
                 CUDA_CHECK(cudaGetLastError());
             }

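For context on the two quantize_src1 call sites above: after this revert the callback again receives src1 as a plain contiguous matrix, described by the column count (ne10), a row count, a channel count, and a padded column count, rather than by explicit byte strides. The sketch below is illustrative only and not the ggml API: it shows how a row would be located in such a padded, contiguous q8_1-style buffer, with kBlockVals/kBlockBytes as assumed stand-ins for the real block constants.

```cpp
#include <cstdint>
#include <cstdio>

// Assumed stand-ins for the quantization block constants in ggml's headers.
constexpr int64_t kBlockVals  = 32; // values per quantization block
constexpr int64_t kBlockBytes = 36; // bytes per block (quants + scales)

// Byte offset of quantized row `row` when every row is padded to `kx_padded`
// values and rows are stored back to back, which is what a contiguous-src1
// quantize pass produces.
static int64_t row_offset_bytes(int64_t row, int64_t kx_padded) {
    return row * (kx_padded / kBlockVals) * kBlockBytes;
}

int main() {
    const int64_t ne10            = 4096; // src1 columns (example value)
    const int64_t padded_col_size = 4096; // already a multiple of the block size here
    std::printf("ne10 = %lld, row 3 starts at byte %lld\n",
                (long long) ne10, (long long) row_offset_bytes(3, padded_col_size));
    return 0;
}
```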
@@ -1889,7 +1879,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);

-    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
+    bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16)
         && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
         && src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
     bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
@@ -1930,12 +1920,10 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);

-    if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
+    if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
         // the custom F16 vector kernel can be used over batched cuBLAS GEMM
         // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
-        ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
-    } else if (!split && use_mul_mat_vec_q) {
-        ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
+        ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
     } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
         && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
         // general KQ + KQV multi-batch without FlashAttention
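The restored dispatch above takes the custom F16/BF16 vector kernel only for a single src1 column and a thin src0 (or when no GPU in the system has fp16 tensor cores), and otherwise falls through to the batched cuBLAS or quantized paths. A minimal sketch of that predicate, with an assumed threshold standing in for the backend's MMV_MAX_ROWS:

```cpp
#include <cstdint>
#include <cstdio>

// Assumed threshold; the real MMV_MAX_ROWS constant lives in the CUDA backend headers.
constexpr int64_t kMmvMaxRowsSketch = 512;

// Mirrors the shape of the restored condition; src0_rows stands in for src0->ne[1],
// the boolean flags for the values computed inside ggml_cuda_mul_mat.
static bool prefer_custom_f16_mmv(int64_t src0_rows, bool split, bool use_mul_mat_vec,
                                  bool any_gpus_without_fp16_mma) {
    return !split && use_mul_mat_vec
        && (src0_rows < kMmvMaxRowsSketch || any_gpus_without_fp16_mma);
}

int main() {
    // A thin src0 (e.g. the KQV product in attention) keeps the vector kernel...
    std::printf("thin src0: %d\n", prefer_custom_f16_mmv(128,  false, true, false));
    // ...while a wide src0 on a tensor-core GPU falls through to the cuBLAS paths.
    std::printf("wide src0: %d\n", prefer_custom_f16_mmv(4096, false, true, false));
    return 0;
}
```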
@@ -2012,15 +2000,6 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *

     GGML_TENSOR_BINARY_OP_LOCALS

-    if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ne2 == 1) {
-        if (ggml_is_quantized(src0->type)) {
-            ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst);
-        } else {
-            ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst);
-        }
-        return;
-    }
-
     GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");

     cudaStream_t stream = ctx.stream();
@@ -2057,75 +2036,97 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
     dst_row.nb[2] = nb1;
     dst_row.nb[3] = nb1;

-    ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
-    ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));
-
-    src1_row.data = src1_contiguous.get();
-    dst_row.data = dst_contiguous.get();
-
-    for (int64_t i02 = 0; i02 < n_as; i02++) {
-        int64_t num_src1_rows = 0;
-
+    if (ne12 == 1) {
         for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
             for (int64_t id = 0; id < n_ids; id++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+                const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);

-                GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);
+                GGML_ASSERT(i02 >= 0 && i02 < n_as);

-                if (row_id_i != i02) {
-                    continue;
-                }
+                const int64_t i11 = id % ne11;
+                const int64_t i12 = iid1;
+
+                const int64_t i1 = id;
+                const int64_t i2 = i12;

-                num_src1_rows++;
+                src0_row.data = src0_original + i02*nb02;
+                src1_row.data = src1_original + i11*nb11 + i12*nb12;
+                dst_row.data = dst_original + i1*nb1 + i2*nb2;
+
+                ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
             }
         }
+    } else {
+        ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1));
+        ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst));

-        if (num_src1_rows == 0) {
-            continue;
-        }
+        src1_row.data = src1_contiguous.get();
+        dst_row.data = dst_contiguous.get();

-        ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
-        ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
-        CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));
+        for (int64_t i02 = 0; i02 < n_as; i02++) {
+            int64_t num_src1_rows = 0;

-        {
-            dim3 block_dims(std::min((unsigned int)ne10, 768u));
-            dim3 grid_dims(ids->ne[1], n_ids);
-            k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                src1_original, src1_contiguous.get(),
-                dev_cur_src1_row.get(), dev_row_mapping.get(),
-                ids_dev, i02, ids->nb[1], ids->nb[0],
-                ne11, ne10,
-                nb11, nb12);
-            CUDA_CHECK(cudaGetLastError());
-        }
+            for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+                for (int64_t id = 0; id < n_ids; id++) {
+                    const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);

-        src0_row.data = src0_original + i02*nb02;
+                    GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as);

-        GGML_ASSERT(nb11 == sizeof(float)*ne10);
-        GGML_ASSERT(nb1 == sizeof(float)*ne0);
+                    if (row_id_i != i02) {
+                        continue;
+                    }

-        src1_row.ne[1] = num_src1_rows;
-        src1_row.nb[1] = nb11;
-        src1_row.nb[2] = num_src1_rows*nb11;
-        src1_row.nb[3] = num_src1_rows*nb11;
+                    num_src1_rows++;
+                }
+            }

-        dst_row.ne[1] = num_src1_rows;
-        dst_row.nb[1] = nb1;
-        dst_row.nb[2] = num_src1_rows*nb1;
-        dst_row.nb[3] = num_src1_rows*nb1;
+            if (num_src1_rows == 0) {
+                continue;
+            }

-        ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+            ggml_cuda_pool_alloc<int> dev_cur_src1_row(ctx.pool(), 1);
+            ggml_cuda_pool_alloc<mmid_row_mapping> dev_row_mapping(ctx.pool(), num_src1_rows);
+            CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream));

-        {
-            dim3 block_dims(std::min((unsigned int)ne0, 768u));
-            dim3 grid_dims(num_src1_rows);
-            k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
-                dst_original, dst_contiguous.get(),
-                dev_row_mapping.get(),
-                ne0,
-                nb1, nb2);
-            CUDA_CHECK(cudaGetLastError());
+            {
+                dim3 block_dims(std::min((unsigned int)ne10, 768u));
+                dim3 grid_dims(ids->ne[1], n_ids);
+                k_copy_src1_to_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                    src1_original, src1_contiguous.get(),
+                    dev_cur_src1_row.get(), dev_row_mapping.get(),
+                    ids_dev, i02, ids->nb[1], ids->nb[0],
+                    ne11, ne10,
+                    nb11, nb12);
+                CUDA_CHECK(cudaGetLastError());
+            }
+
+            src0_row.data = src0_original + i02*nb02;
+
+            GGML_ASSERT(nb11 == sizeof(float)*ne10);
+            GGML_ASSERT(nb1 == sizeof(float)*ne0);
+
+            src1_row.ne[1] = num_src1_rows;
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.ne[1] = num_src1_rows;
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row);
+
+            {
+                dim3 block_dims(std::min((unsigned int)ne0, 768u));
+                dim3 grid_dims(num_src1_rows);
+                k_copy_dst_from_contiguous<<<grid_dims, block_dims, 0, stream>>>(
+                    dst_original, dst_contiguous.get(),
+                    dev_row_mapping.get(),
+                    ne0,
+                    nb1, nb2);
+                CUDA_CHECK(cudaGetLastError());
+            }
         }
     }
 }
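The block above restores the pre-#13014 structure of ggml_cuda_mul_mat_id: with a single token (ne12 == 1) each routed row is multiplied directly through tensor views, while the multi-token path first counts, per expert, how many src1 rows were routed to it and then gathers them into a contiguous buffer. The host-side counting loop is sketched below with a dense ids matrix and made-up sizes; it illustrates the bookkeeping only, not the backend code.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    // Example routing table: 3 tokens, each sent to 2 of 4 experts (made-up values).
    const int64_t n_as  = 4; // number of experts
    const int64_t n_ids = 2; // experts selected per token
    const std::vector<int32_t> ids = { 0, 3,   1, 3,   0, 1 };

    const int64_t n_tokens = (int64_t) ids.size() / n_ids;
    for (int64_t i02 = 0; i02 < n_as; i02++) {
        int64_t num_src1_rows = 0;
        for (int64_t iid1 = 0; iid1 < n_tokens; iid1++) {
            for (int64_t id = 0; id < n_ids; id++) {
                const int32_t row_id_i = ids[iid1*n_ids + id];
                if (row_id_i == i02) {
                    num_src1_rows++; // this src1 row joins expert i02's batched matmul
                }
            }
        }
        // In the real path, experts with num_src1_rows == 0 are skipped and the rest
        // get their rows gathered into a contiguous buffer before one ggml_cuda_mul_mat call.
        std::printf("expert %lld receives %lld rows\n",
                    (long long) i02, (long long) num_src1_rows);
    }
    return 0;
}
```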
@@ -2493,7 +2494,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #endif
         }

-        if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) {
+        if (node->op == GGML_OP_MUL_MAT_ID) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
             GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__);
@@ -3207,7 +3208,9 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         }
         case GGML_OP_ROPE:
         case GGML_OP_ROPE_BACK: {
-            return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
+            const size_t ts = ggml_type_size(op->src[0]->type);
+            const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2];
+            return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts;
         }
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
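The reverted ROPE/ROPE_BACK check above accepts a tensor when its innermost stride equals the element size and its outermost stride equals ne0*ne1*ne2 elements, i.e. each i3 slice is densely packed. A small sketch of that stride arithmetic with made-up shape values (ggml stores ne[] element counts and nb[] byte strides per dimension):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne[4] = { 128, 32, 8, 2 }; // example shape (made-up)
    const size_t  ts    = 2;                 // bytes per element, e.g. one fp16 value

    // Canonical strides of a densely packed tensor: nb[i] = nb[i-1] * ne[i-1].
    size_t nb[4];
    nb[0] = ts;
    for (int i = 1; i < 4; i++) {
        nb[i] = nb[i-1] * (size_t) ne[i-1];
    }

    // The restored condition: element-sized inner stride and a packed outer stride.
    const int64_t ne0_012 = ne[0]*ne[1]*ne[2];
    const bool supported = nb[0] == ts && nb[3] == (size_t) ne0_012*ts;
    std::printf("layout accepted for ROPE: %s\n", supported ? "yes" : "no");
    return 0;
}
```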
