
Commit 235c989

ikawrakow and Iwan Kawrakow authored
Vulkan: adding GGML_OP_MULTI_ADD implementation (#582)
Co-authored-by: Iwan Kawrakow <[email protected]>
1 parent 3e024de commit 235c989

File tree: 3 files changed, +60 -23 lines

ggml/src/ggml-vulkan.cpp

Lines changed: 35 additions & 0 deletions
@@ -447,6 +447,8 @@ struct vk_device_struct {
     vk_pipeline pipeline_fused_mul_silu[2];
     vk_pipeline pipeline_fused_mul_relu[2];
 
+    vk_pipeline pipeline_multi_add_f32;
+
     vk_pipeline pipeline_leaky_relu_f32;
     vk_pipeline pipeline_silu_back_f32;
     vk_pipeline pipeline_diag_mask_inf_f32;
@@ -683,6 +685,13 @@ struct vk_op_unary_push_constants {
 };
 static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");
 
+struct vk_op_multiadd_push_constants {
+    uint32_t ne;
+    uint32_t ne0, ne1;
+    uint32_t nb0, nb01;
+    uint32_t nadd;
+};
+
 // See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
 // Precompute mp (m' in the paper) and L such that division
 // can be computed using a multiply (high 32b of 64b result)
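Read together with the dispatch code further down, the new push constants describe a flattened 2-D multi-add: ne is the total number of destination elements, the destination is ne0 x ne1 with a row stride of nb0 floats, source rows are nb01 floats apart, and nadd consecutive ne0-sized blocks are summed into each destination row. This reading of the fields is inferred from how they are filled and used below, not stated in the commit:

// Inferred meaning of vk_op_multiadd_push_constants (assumption, see ggml_vk_multi_add below):
//   ne    total number of elements written to dst
//   ne0   dst->ne[0], the row length (n_embd in the MoE caller)
//   ne1   dst->ne[1], the number of rows (n_tokens)
//   nb0   dst row stride, in floats
//   nb01  src row stride, in floats
//   nadd  number of consecutive ne0-sized blocks summed per destination row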
@@ -2759,6 +2768,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_fused_mul_relu[0], "fused_mul_relu_f32", fused_mul_relu_f32_len, fused_mul_relu_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_fused_mul_relu[1], "fused_mul_relu_f16", fused_mul_relu_f16_len, fused_mul_relu_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_multi_add_f32, "multi_add_f32", multi_add_f32_len, multi_add_f32_data, "main", 2, sizeof(vk_op_multiadd_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
@@ -6451,6 +6462,12 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             }
             return nullptr;
         }
+    case GGML_OP_MULTI_ADD:
+        if (src0->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F32 ||
+            dst->ne[2] == 1 || dst->ne[3] == 1) {
+            return ctx->device->pipeline_multi_add_f32;
+        }
+        return nullptr;
     case GGML_OP_UNARY:
         if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) ||
             (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) ||
@@ -6588,6 +6605,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
     case GGML_OP_RMS_NORM:
     case GGML_OP_FUSED_RMS_NORM:
     case GGML_OP_IM2COL:
+    case GGML_OP_MULTI_ADD:
         return true;
     default:
         return false;
@@ -6889,6 +6907,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     case GGML_OP_CONCAT:
     case GGML_OP_UPSCALE:
     case GGML_OP_FUSED_MUL_UNARY:
+    case GGML_OP_MULTI_ADD:
     case GGML_OP_UNARY:
         {
             uint32_t ne = ggml_nelements(dst);
@@ -7278,6 +7297,12 @@ static void ggml_vk_fused_mul_unary(ggml_backend_vk_context * ctx, vk_context& s
     ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_FUSED_MUL_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
 }
 
+static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
+    uint32_t nadd = (uint32_t)dst->op_params[0];
+    ggml_vk_op_f32<vk_op_multiadd_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_MULTI_ADD,
+            { (uint32_t)ggml_nelements(dst), (uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)(dst->nb[1]/sizeof(float)), (uint32_t)(src0->nb[1]/sizeof(float)), nadd }, dryrun);
+}
+
 static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     int32_t * op_params = (int32_t *)dst->op_params;
     ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun);
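To make the stride handling concrete: for the MoE aggregation in src/llama.cpp below, dst is the output of ggml_multi_add on a 2-D view of the experts tensor. Assuming hypothetical sizes n_embd = 4096, n_expert_used = 8, n_tokens = 16 and a contiguous experts tensor of shape (n_embd, n_expert_used, n_tokens), the push constants would come out as follows (placeholder numbers, for illustration only):

// ne   = 4096 * 16    total elements in dst
// ne0  = 4096          dst->ne[0], one embedding vector per token
// ne1  = 16             dst->ne[1], number of tokens
// nb0  = 4096          dst row stride in floats (dst is contiguous)
// nb01 = 8 * 4096      src row stride in floats: experts->nb[2] / sizeof(float),
//                       i.e. one source row spans all 8 expert outputs for a token
// nadd = 8              number of consecutive n_embd-sized blocks summed per row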
@@ -8463,6 +8488,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
         }
         break;
     case GGML_OP_FUSED_MUL_UNARY:
+    case GGML_OP_MULTI_ADD:
     case GGML_OP_REPEAT:
     case GGML_OP_REPEAT_BACK:
     case GGML_OP_GET_ROWS:
@@ -8546,6 +8572,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_RMS_NORM_BACK:
     case GGML_OP_UNARY:
     case GGML_OP_FUSED_MUL_UNARY:
+    case GGML_OP_MULTI_ADD:
     case GGML_OP_DIAG_MASK_INF:
     case GGML_OP_SOFT_MAX:
     case GGML_OP_SOFT_MAX_BACK:
@@ -8662,6 +8689,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_FUSED_MUL_UNARY:
         ggml_vk_fused_mul_unary(ctx, compute_ctx, src0, src1, node, dryrun);
         break;
+    case GGML_OP_MULTI_ADD:
+        ggml_vk_multi_add(ctx, compute_ctx, src0, node, dryrun);
+        break;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
         case GGML_UNARY_OP_SILU:
@@ -8834,6 +8864,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
     case GGML_OP_REPEAT:
     case GGML_OP_REPEAT_BACK:
     case GGML_OP_FUSED_MUL_UNARY:
+    case GGML_OP_MULTI_ADD:
         buf = tensor->buffer;
 
         break;
@@ -9530,6 +9561,8 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                 return false;
             }
             break;
+        case GGML_OP_MULTI_ADD:
+            return op->src[0]->type == GGML_TYPE_F32 && op->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
             {
@@ -10266,6 +10299,8 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
         }
     } else if (tensor->op == GGML_OP_FUSED_MUL_UNARY) {
         tensor_clone = ggml_fused_mul_unary(ggml_ctx, src_clone[0], src_clone[1], (ggml_unary_op)tensor->op_params[0]);
+    } else if (tensor->op == GGML_OP_MULTI_ADD) {
+        tensor_clone = ggml_multi_add(ggml_ctx, src_clone[0], tensor->op_params[0]);
     } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
         if (src1 == nullptr) {
             tensor_clone = ggml_dup(ggml_ctx, src_clone[0]);

ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp

Lines changed: 2 additions & 0 deletions
@@ -579,6 +579,8 @@ void process_shaders() {
     string_to_spv("fused_mul_relu_f16", "fused_mul_relu.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
     string_to_spv("fused_mul_relu_f32", "fused_mul_relu.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
 
+    string_to_spv("multi_add_f32", "multi_add.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
+
     string_to_spv("gelu_f16", "gelu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
     string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("gelu_quick_f16", "gelu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});

src/llama.cpp

Lines changed: 23 additions & 23 deletions
@@ -9870,28 +9870,28 @@ llm_expert_gating_func_type gating_op,
         cb(cur, "ffn_moe_weighted", il);
     }
 
-#ifdef GGML_USE_VULKAN
-    // aggregate experts
-    ggml_tensor * moe_out = nullptr;
-    //ggml_tensor * first_expert = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
-        ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
-                experts->nb[2], i*experts->nb[1]);
-
-        if (i == 0) {
-            moe_out = cur_expert;
-        } else {
-            moe_out = ggml_add(ctx, moe_out, cur_expert);
-        }
-    }
-
-    if (n_expert_used == 1) {
-        // avoid returning a non-contiguous tensor
-        moe_out = ggml_cont(ctx, moe_out);
-    }
-
-    return moe_out;
-#else
+//#ifdef GGML_USE_VULKAN
+//    // aggregate experts
+//    ggml_tensor * moe_out = nullptr;
+//    //ggml_tensor * first_expert = nullptr;
+//    for (int i = 0; i < n_expert_used; ++i) {
+//        ggml_tensor * cur_expert = ggml_view_2d(ctx, experts, n_embd, n_tokens,
+//                experts->nb[2], i*experts->nb[1]);
+//
+//        if (i == 0) {
+//            moe_out = cur_expert;
+//        } else {
+//            moe_out = ggml_add(ctx, moe_out, cur_expert);
+//        }
+//    }
+//
+//    if (n_expert_used == 1) {
+//        // avoid returning a non-contiguous tensor
+//        moe_out = ggml_cont(ctx, moe_out);
+//    }
+//
+//    return moe_out;
+//#else
     if (n_expert_used == 1) {
         return ggml_cont(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0));
     }
@@ -9900,7 +9900,7 @@ llm_expert_gating_func_type gating_op,
             ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], experts->nb[1]));
     }
     return ggml_multi_add(ctx, ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0), n_expert_used);
-#endif
+//#endif
 
 }
 
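With GGML_OP_MULTI_ADD now implemented in the Vulkan backend, the Vulkan-only aggregation path above (a chain of n_expert_used - 1 ggml_add nodes over per-expert views) is commented out and all backends take the single ggml_multi_add node. A minimal usage sketch mirroring the call above, with the n_expert_used == 1 fast path kept; the shapes are placeholder assumptions and ggml_multi_add is used exactly as in this diff:

#include "ggml.h"   // ggml_multi_add() is declared here in this fork (assumption)

// Sketch: collapse the n_expert_used expert outputs per token into one tensor with
// a single graph node. 'experts' is assumed to have shape (n_embd, n_expert_used, n_tokens),
// so a 2-D view with row stride experts->nb[2] places the expert blocks for one token
// back-to-back within each row.
static ggml_tensor * aggregate_experts(ggml_context * ctx, ggml_tensor * experts,
        int64_t n_embd, int64_t n_tokens, int n_expert_used) {
    ggml_tensor * view = ggml_view_2d(ctx, experts, n_embd, n_tokens, experts->nb[2], 0);
    if (n_expert_used == 1) {
        return ggml_cont(ctx, view);  // avoid returning a non-contiguous tensor
    }
    return ggml_multi_add(ctx, view, n_expert_used);
}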
