Commit c1b1876
CUDA: skip fusion for repeating adds in bias (ggml-org#17080)
1 parent b8a5cfd commit c1b1876
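
This commit makes the CUDA backend skip its matmul+bias fusion whenever the bias add is a repeating (broadcast) add. In ggml, ggml_add repeats the second operand across the first when their shapes differ, while the fused kernel assumes a plain element-wise add (per the new "we don't support repeating adds" comment below). A minimal sketch of the kind of graph that now falls back to the unfused path (tensor names and sizes are illustrative, not taken from the commit):

    // Illustrative repro sketch: a bias whose ne[] does not match the matmul
    // output is a "repeating" (broadcast) add in ggml, so fusion is skipped.
    ggml_tensor * x    = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, k, 1, 4, 2);
    ggml_tensor * w    = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, k, n);
    ggml_tensor * out  = ggml_mul_mat(ctx, w, x);                   // ne = {n, 1, 4, 2}
    ggml_tensor * bias = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n); // ne = {n, 1, 1, 1}
    out = ggml_add(ctx, out, bias); // shapes differ -> broadcast add -> no fusion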

3 files changed: +18, -6 lines

ggml/src/ggml-cuda/CMakeLists.txt
Lines changed: 1 addition & 0 deletions

@@ -124,6 +124,7 @@ if (CUDAToolkit_FOUND)
 
     if (GGML_CUDA_DEBUG)
         list(APPEND CUDA_FLAGS -lineinfo)
+        add_compile_definitions(GGML_CUDA_DEBUG)
     endif()
 
     if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "12.8")
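
Presumably the point of this hunk: GGML_CUDA_DEBUG previously only appended -lineinfo to the CUDA compile flags, and the preprocessor macro of the same name was never defined, so the #ifdef GGML_CUDA_DEBUG diagnostics in ggml-cuda.cu (visible in the next file) were compiled out even with the option enabled. add_compile_definitions(GGML_CUDA_DEBUG) makes the macro visible to the compiler.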

ggml/src/ggml-cuda/ggml-cuda.cu
Lines changed: 11 additions & 2 deletions

@@ -3152,8 +3152,6 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor * node = cgraph->nodes[i];
-
-
 #ifdef GGML_CUDA_DEBUG
         const int nodes_fused = i - prev_i - 1;
         prev_i = i;

@@ -3302,6 +3300,13 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             continue;
         }
 
+        // we don't support repeating adds
+        if (bias_op == GGML_OP_ADD &&
+            (!ggml_are_same_shape(gate_bias_n->src[0], gate_bias_n->src[1]) ||
+             !ggml_are_same_shape(up_bias_n->src[0], up_bias_n->src[1]))) {
+            continue;
+        }
+
         const ggml_tensor * src0 = up_n->src[0];
         const ggml_tensor * src1 = up_n->src[1];
         const ggml_tensor * ids  = up_n->src[2];

@@ -3411,6 +3416,10 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx
             continue;
         }
 
+        if (bias_op == GGML_OP_ADD && !ggml_are_same_shape(bias_node->src[0], bias_node->src[1])) {
+            continue;
+        }
+
         ggml_cuda_mm_fusion_args_host fusion_data{};
         fusion_data.x_bias = bias_tensor;

tests/test-backend-ops.cpp
Lines changed: 6 additions & 4 deletions

@@ -4984,23 +4984,25 @@ struct test_mul_mat_vec_fusion : public test_case {
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         if (!use_id) {
-            std::array<int64_t, 4> ne = {k, m, 1, 1};
-            std::array<int64_t, 4> ne0 = {k, n, 1, 1};
+            const int channels = 4;
+            const int samples = 2;
+            std::array<int64_t, 4> ne = { k, m, channels, samples };
+            std::array<int64_t, 4> ne0 = { k, n, channels, samples };
 
             ggml_tensor * cur = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data());
             ggml_tensor * gate = with_gate ? ggml_new_tensor(ctx, type, 4, ne0.data()) : nullptr;
             ggml_tensor * up = ggml_new_tensor(ctx, type, 4, ne0.data());
 
             ggml_tensor * ffn_up = ggml_mul_mat(ctx, up, cur);
             if (with_bias) {
-                std::array<int64_t, 4> bias_ne = {ffn_up->ne[0], 1, 1, 1};
+                std::array<int64_t, 4> bias_ne = { ffn_up->ne[0], 1, channels, samples };
                 ggml_tensor * up_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data());
                 ffn_up = ggml_add(ctx, ffn_up, up_bias);
             }
 
             ggml_tensor * ffn_gate = with_gate ? ggml_mul_mat(ctx, gate, cur) : nullptr;
             if (with_bias && with_gate) {
-                std::array<int64_t, 4> bias_ne = {ffn_gate->ne[0], 1, 1, 1};
+                std::array<int64_t, 4> bias_ne = { ffn_gate->ne[0], 1, channels, samples };
                 ggml_tensor * gate_bias = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, bias_ne.data());
                 ffn_gate = ggml_add(ctx, ffn_gate, gate_bias);
             }
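
With channels = 4 and samples = 2, the fusion test now covers batched dimensions: the matmul output has ne = {n, m, 4, 2} and each bias ne = {n, 1, 4, 2}, so the bias matches the output shape only when m == 1 (the vector case), and anything else is a broadcast add that should hit the new skip. Either way, test-backend-ops compares the CUDA output against the reference backend, so the fallback path is verified for correctness as well.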
