Skip to content

Commit 6fd4f95

Browse files
authored
Fix too relaxed check on CUDA "fast copy" (can_be_transposed) condition (ggml-org#17332)
* Fix too relaxed check on CUDA "fast copy" (can_be_transposed) condition
* Argh.
* Making CISC happy ;)
* Integrate CONT tests
* Use loopy loop
* Skip new tests for (B)F16 for now.
1 parent 980b7cd commit 6fd4f95

File tree

2 files changed

+29
-17
lines changed

2 files changed

+29
-17
lines changed

ggml/src/ggml-cuda/cpy.cu

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -384,7 +384,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
384384
char * src1_ddc = (char *) src1->data;
385385

386386
const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
387-
const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) && src0->ne[3] == 1;
387+
const bool can_be_transposed = nb01 == (int64_t)ggml_element_size(src0) &&
388+
src0->ne[3] == 1 && nb02 == ne00 * ne01 * (int64_t)ggml_element_size(src0);
388389

389390
if (src0->type == src1->type && contiguous_srcs) {
390391
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));

tests/test-backend-ops.cpp

Lines changed: 27 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -2776,24 +2776,34 @@ struct test_cpy : public test_case {
27762776
struct test_cont : public test_case {
27772777
const ggml_type type;
27782778
const std::array<int64_t, 4> ne;
2779+
bool use_view_slice;
27792780

27802781
std::string vars() override {
2781-
return VARS_TO_STR2(type, ne);
2782+
return VARS_TO_STR3(type, ne, use_view_slice);
27822783
}
27832784

27842785
test_cont(ggml_type type = GGML_TYPE_F32,
2785-
std::array<int64_t, 4> ne = {10, 10, 10, 1})
2786-
: type(type), ne(ne) {}
2786+
std::array<int64_t, 4> ne = {10, 10, 10, 1},
2787+
bool use_view_slice = false)
2788+
: type(type), ne(ne), use_view_slice(use_view_slice) {}
27872789

27882790
ggml_tensor * build_graph(ggml_context * ctx) override {
27892791
ggml_tensor * src = ggml_new_tensor(ctx, type, 4, ne.data());
27902792
ggml_set_param(src);
27912793
ggml_set_name(src, "src");
27922794

2793-
src = ggml_transpose(ctx, src);
2794-
ggml_set_name(src, "src_transposed");
27952795

2796-
ggml_tensor * out = ggml_cont(ctx, src);
2796+
ggml_tensor * dst;
2797+
if (use_view_slice) {
2798+
dst = ggml_view_4d(ctx, src, src->ne[0], 1, src->ne[2], src->ne[3],
2799+
src->nb[1], src->nb[2], src->nb[3], src->nb[0] * (src->ne[1] - 1));
2800+
ggml_set_name(dst, "src_view_slice");
2801+
} else {
2802+
dst = ggml_transpose(ctx, src);
2803+
ggml_set_name(dst, "src_transposed");
2804+
}
2805+
2806+
ggml_tensor * out = ggml_cont(ctx, dst);
27972807
ggml_set_name(out, "out");
27982808

27992809
return out;
@@ -6945,16 +6955,17 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
69456955
test_cases.emplace_back(new test_cpy(GGML_TYPE_BF16, GGML_TYPE_BF16, {256, 4, 1, 1}, {0, 0, 0, 0}, {0, 0, 0, 0}, true));
69466956
test_cases.emplace_back(new test_cpy(GGML_TYPE_F32, GGML_TYPE_F32, {256, 1, 4, 1}, {1, 2, 0, 3}, {0, 0, 0, 0}));
69476957

6948-
test_cases.emplace_back(new test_cont());
6949-
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 1 ,1}));
6950-
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 1, 3 ,5}));
6951-
test_cases.emplace_back(new test_cont(GGML_TYPE_F32, {2, 3, 5 ,7}));
6952-
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 1 ,1}));
6953-
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 1, 3 ,5}));
6954-
test_cases.emplace_back(new test_cont(GGML_TYPE_F16, {2, 3, 5 ,7}));
6955-
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 1 ,1}));
6956-
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 1, 3 ,5}));
6957-
test_cases.emplace_back(new test_cont(GGML_TYPE_BF16, {2, 3, 5 ,7}));
6958+
for (ggml_type type_dst : { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16 }) {
6959+
for (bool use_view_slice : { true, false }) {
6960+
for (std::array<int64_t, 4> ne : std::initializer_list<std::array<int64_t, 4>>{ {2, 1, 1, 1}, {2, 1, 3, 5},
6961+
{2, 3, 5, 7}, {1, 4, 4, 1}, {1, 8, 17, 1}, {10, 10, 10, 1} }) {
6962+
if (use_view_slice && (type_dst == GGML_TYPE_F16 || type_dst == GGML_TYPE_BF16)) {
6963+
continue; // TODO: add after WebGPU is fixed
6964+
}
6965+
test_cases.emplace_back(new test_cont(type_dst, ne, use_view_slice));
6966+
}
6967+
}
6968+
}
69586969

69596970
auto add_test_bin_bcast = [&](ggml_type type, std::array<int64_t, 4> ne, std::array<int, 4> nr) {
69606971
for (auto op : {ggml_add, ggml_sub, ggml_mul, ggml_div}) {

0 commit comments

Comments (0)