
Commit aa37417

CUDA: fix crash on uneven context without FA (ggml-org#16988)
1 parent 5b180c3 commit aa37417
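
Summary: with an uneven context size and FlashAttention disabled, the CUDA dispatch code could select the MMF/MMVF matrix-multiplication kernels for tensor views whose byte strides are not a multiple of two elements, causing a crash. The selection predicates ggml_cuda_should_use_mmf and ggml_cuda_should_use_mmvf now receive src0->nb (byte strides) and reject such tensors, routing them to other mat-mul paths instead; test-backend-ops is reworked so the physical width of the backing tensor in the non-contiguous view tests is configurable (k_v).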

File tree: 7 files changed, +44 −38 lines

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 6 additions & 6 deletions
@@ -2113,7 +2113,7 @@ static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
         src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
 
     const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
-    use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);
+    use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, is_mul_mat_id ? src1->ne[2] : src1->ne[1]);
 
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft) ||
         ggml_backend_buft_is_cuda_split(src1->buffer->buft);
@@ -2207,16 +2207,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
             const int cc = ggml_cuda_info().devices[id].cc;
             const int warp_size = ggml_cuda_info().devices[id].warp_size;
             use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-            use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
-            use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+            use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
+            use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
             any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
         }
     } else {
         const int cc = ggml_cuda_info().devices[ctx.device].cc;
         const int warp_size = ggml_cuda_info().devices[ctx.device].warp_size;
         use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
-        use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src1->ne[1], /*mul_mat_id=*/false);
-        use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+        use_mul_mat_f = use_mul_mat_f && ggml_cuda_should_use_mmf(src0->type, cc, warp_size, src0->ne, src0->nb, src1->ne[1], /*mul_mat_id=*/false);
+        use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src0->nb, src1->ne[1]);
         any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_hardware_available(cc);
     }
 
@@ -2287,7 +2287,7 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor *
         return;
     }
 
-    if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src1->ne[2], /*mul_mat_id=*/true)) {
+    if (ggml_cuda_should_use_mmf(src0->type, cc, WARP_SIZE, src0->ne, src0->nb, src1->ne[2], /*mul_mat_id=*/true)) {
         ggml_cuda_mul_mat_f(ctx, src0, src1, ids, dst);
         return;
     }
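
All three dispatch sites (the mul_mat_vec fusion check, ggml_cuda_mul_mat, and ggml_cuda_mul_mat_id) now forward src0->nb alongside src0->ne, so the selection predicates below can check byte strides as well as logical shape.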

ggml/src/ggml-cuda/mmf.cu

Lines changed: 9 additions & 3 deletions
@@ -119,15 +119,21 @@ void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * sr
     }
 }
 
-bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne, const int src1_ncols, bool mul_mat_id) {
-
+bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * src0_ne,
+        const size_t * src0_nb, const int src1_ncols, bool mul_mat_id) {
     if (ggml_is_quantized(type)) {
         return false;
     }
 
-    if (src0_ne[0] % (warp_size * (4/ggml_type_size(type))) != 0) {
+    const size_t ts = ggml_type_size(type);
+    if (src0_ne[0] % (warp_size * (4/ts)) != 0) {
         return false;
     }
+    for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
+        if (src0_nb[i] % (2*ts) != 0) {
+            return false;
+        }
+    }
     if (src0_ne[1] % MMF_ROWS_PER_BLOCK != 0) {
         return false;
     }
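
To see what the new guard does in isolation, here is a minimal, self-contained C++ sketch (not the ggml implementation: the function name strides_2elem_aligned, the hard-coded dimension count, and the example strides are invented for illustration). A view taken out of a wider buffer keeps nb[1] equal to the backing row width in bytes, so a row width that is odd in elements fails the check and the MMF/MMVF path is skipped:

#include <cstddef>
#include <cstdio>

// Mirrors the predicate added above: every stride past dim 0 must be a
// multiple of two elements, the granularity of the kernels' vectorized loads.
static bool strides_2elem_aligned(const size_t * nb, size_t type_size, size_t n_dims = 4) {
    for (size_t i = 1; i < n_dims; ++i) {
        if (nb[i] % (2*type_size) != 0) {
            return false;
        }
    }
    return true;
}

int main() {
    const size_t ts = 2; // element size of F16
    // Rows of 4097 F16 values: nb[1] = 8194 bytes, not a multiple of 2*ts = 4.
    const size_t nb_uneven[4] = {ts, 4097*ts, 4097*ts*8, 4097*ts*8};
    // Rows of 4096 F16 values: nb[1] = 8192 bytes, aligned.
    const size_t nb_even[4]   = {ts, 4096*ts, 4096*ts*8, 4096*ts*8};
    printf("uneven rows: %s\n", strides_2elem_aligned(nb_uneven, ts) ? "ok" : "rejected");
    printf("even rows:   %s\n", strides_2elem_aligned(nb_even,   ts) ? "ok" : "rejected");
    return 0;
}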

ggml/src/ggml-cuda/mmf.cuh

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ struct mmf_ids_data {
 
 void ggml_cuda_mul_mat_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
 
-bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const int src1_ncols, bool mul_mat_id);
+bool ggml_cuda_should_use_mmf(enum ggml_type type, int cc, int warp_size, const int64_t * scr0_ne, const size_t * src0_nb, const int src1_ncols, bool mul_mat_id);
 
 template <typename T, int rows_per_block, int cols_per_block, int nwarps, bool has_ids>
 __launch_bounds__(ggml_cuda_get_physical_warp_size()*nwarps, 1)

ggml/src/ggml-cuda/mmvf.cu

Lines changed: 7 additions & 1 deletion
@@ -716,10 +716,16 @@ void ggml_cuda_op_mul_mat_vec_f(
     GGML_UNUSED_VARS(ctx, src1, dst, src1_ddq_i, src1_ncols, src1_padded_row_size);
 }
 
-bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11) {
+bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, const size_t * src0_nb, int64_t ne11) {
     if (src0_ne[0] % 2 != 0) {
         return false;
     }
+    const size_t ts = ggml_type_size(type);
+    for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
+        if (src0_nb[i] % (2*ts) != 0) {
+            return false;
+        }
+    }
     switch (type) {
         case GGML_TYPE_F32:
             if (GGML_CUDA_CC_IS_NVIDIA(cc)) {

ggml/src/ggml-cuda/mmvf.cuh

Lines changed: 1 addition & 1 deletion
@@ -9,4 +9,4 @@ void ggml_cuda_op_mul_mat_vec_f(
     const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
     const int64_t src1_padded_row_size, cudaStream_t stream);
 
-bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, int64_t ne11);
+bool ggml_cuda_should_use_mmvf(enum ggml_type type, int cc, const int64_t * src0_ne, const size_t * src0_nb, int64_t ne11);

src/llama-context.cpp

Lines changed: 2 additions & 0 deletions
@@ -21,6 +21,8 @@ llama_context::llama_context(
         llama_context_params params) :
     model(model),
     balloc(std::make_unique<llama_batch_allocr>(model.hparams.n_pos_per_embd())) {
+    // TODO warning when creating llama_context with awkward ctx size that is not a power of 2,
+    // may need to be backend-dependent
     LLAMA_LOG_INFO("%s: constructing llama_context\n", __func__);
 
     t_start_us = model.t_start_us;

tests/test-backend-ops.cpp

Lines changed: 18 additions & 26 deletions
@@ -3385,11 +3385,11 @@ struct test_mul_mat : public test_case {
     const std::array<int64_t, 2> bs; // dims 3 and 4
     const std::array<int64_t, 2> nr; // repeat in dims 3 and 4
     const std::array<int64_t, 4> per; // permutation of dimensions
-    const bool v; // whether a and b are non-contiguous views
+    const int64_t k_v; // size of k in memory, resulting in a non-contiguous view for k_v > k, no view for k_v == 0
     const uint32_t o; // number of outputs
 
     std::string vars() override {
-        return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, v, o);
+        return VARS_TO_STR10(type_a, type_b, m, n, k, bs, nr, per, k_v, o);
     }
 
     double max_nmse_err() override {
@@ -3410,8 +3410,8 @@ struct test_mul_mat : public test_case {
             std::array<int64_t, 2> bs = {10, 10},
             std::array<int64_t, 2> nr = {2, 2},
             std::array<int64_t, 4> per = {0, 1, 2, 3},
-            bool v = false, uint32_t o = 1)
-        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v), o(o) {}
+            int64_t k_v = 0, uint32_t o = 1)
+        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), k_v(k_v), o(o) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         // C^T = A * B^T: (k, m) * (k, n) => (m, n)
@@ -3421,7 +3421,7 @@ struct test_mul_mat : public test_case {
         const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3);
         if (npermuted > 0) {
             GGML_ASSERT(npermuted == 2);
-            GGML_ASSERT(!v); // not handled
+            GGML_ASSERT(k_v == 0); // not handled
             GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0);
             GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0);
 
@@ -3445,29 +3445,21 @@ struct test_mul_mat : public test_case {
             ggml_set_name(a, "a_permuted");
             ggml_set_name(b, "b_permuted");
         } else {
-            if (v) {
-                a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]);
-                b = ggml_new_tensor_4d(ctx, type_b, k*2, n, bs[0]*nr[0], bs[1]*nr[1]);
+            const int64_t k_physical = k_v == 0 ? k : k_v;
+            a = ggml_new_tensor_4d(ctx, type_a, k_physical, m, bs[0], bs[1]);
+            b = ggml_new_tensor_4d(ctx, type_b, k_physical, n, bs[0]*nr[0], bs[1]*nr[1]);
 
-                if (!ggml_is_quantized(type_a)) {
-                    if (bs[1] == 1 && nr[1] == 1) {
-                        ggml_set_param(a);
-                    }
-                    ggml_set_param(b);
+            if (!ggml_is_quantized(type_a)) {
+                if (bs[1] == 1 && nr[1] == 1) {
+                    ggml_set_param(a);
                 }
+                ggml_set_param(b);
+            }
 
+            if (k_v != 0) {
+                GGML_ASSERT(k_v > k);
                 a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0);
                 b = ggml_view_4d(ctx, b, k, n, bs[0]*nr[0], bs[1]*nr[1], b->nb[1], b->nb[2], b->nb[3], 0);
-            } else {
-                a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0], bs[1]);
-                b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
-
-                if (!ggml_is_quantized(type_a)) {
-                    if (bs[1] == 1 && nr[1] == 1) {
-                        ggml_set_param(a);
-                    }
-                    ggml_set_param(b);
-                }
             }
             ggml_set_name(a, "a");
             ggml_set_name(b, "b");
@@ -6901,7 +6893,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3}));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3}));
-    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, true, 3));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 16, 32, 32, { 1, 1}, {1, 1}, {0, 1, 2, 3}, 64, 3));
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F32, GGML_TYPE_F32, 64, 77, 77, {12,1}, {1,1}));
 
 #if 0
@@ -6927,7 +6919,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
             for (uint32_t k = 0; k < 2; ++k) {
                 for (ggml_type type: {GGML_TYPE_F16, GGML_TYPE_BF16, GGML_TYPE_F32}) {
                     test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 1056 + m, 1, 128 + k, {bs, bs2}, {nr, 1}, {0, 2, 1, 3}));
-                    test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, bs2}, {nr, 1}, {0, 1, 2, 3}, true));
+                    test_cases.emplace_back(new test_mul_mat(type, GGML_TYPE_F32, 128 + m, 1, 1056 + k, {bs, bs2}, {nr, 1}, {0, 1, 2, 3}, 2*1056 + k));
                 }
             }
         }
@@ -7432,7 +7424,7 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
     test_cases.emplace_back(new test_pad_reflect_1d(GGML_TYPE_F32, {3000, 384, 4, 1}));
 
     test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8, 1}, {4, 1}, {0, 2, 1, 3}));
-    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, true));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8, 1}, {4, 1}, {0, 1, 2, 3}, 2*16416));
 
     for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
         for (ggml_type type_a : all_types) {
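
For context on the reworked test_mul_mat: when k_v > k the harness allocates a backing tensor with k_v physical columns and then takes a k-column view of it, so nb[1] stays at k_v elements and the operands become non-contiguous along dim 1, exercising exactly the stride check added above. A minimal sketch of that construction against the public ggml API (the shapes and the 16 MiB pool are arbitrary example values, not taken from the test):

#include "ggml.h"

int main() {
    struct ggml_init_params params = {/*mem_size =*/ 16*1024*1024, /*mem_buffer =*/ NULL, /*no_alloc =*/ false};
    struct ggml_context * ctx = ggml_init(params);

    const int64_t k = 32, k_v = 64, m = 16;

    // Backing tensor with k_v physical columns per row...
    struct ggml_tensor * backing = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, k_v, m, 1, 1);
    // ...viewed down to k logical columns; nb[1] remains k_v*sizeof(float),
    // so the view is non-contiguous, like the k_v > k test cases.
    struct ggml_tensor * a = ggml_view_4d(ctx, backing, k, m, 1, 1,
                                          backing->nb[1], backing->nb[2], backing->nb[3], 0);

    // ggml_is_contiguous(a) is false here: a->nb[1] != k*sizeof(float).

    ggml_free(ctx);
    return 0;
}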
