
Commit 8a0c38f

Vulkan: add GGML_OP_FUSED_MUL_UNARY (#580)

ikawrakow (Iwan Kawrakow) authored
Co-authored-by: Iwan Kawrakow <[email protected]>

Parent: 9534461

6 files changed: +142 -7 lines changed

ggml/src/ggml-vulkan.cpp

Lines changed: 61 additions & 0 deletions
@@ -442,6 +442,11 @@ struct vk_device_struct {
     vk_pipeline pipeline_tanh[2];
     vk_pipeline pipeline_sigmoid[2];
 
+    // [src/dst 0=fp32,1=fp16]
+    vk_pipeline pipeline_fused_mul_gelu[2];
+    vk_pipeline pipeline_fused_mul_silu[2];
+    vk_pipeline pipeline_fused_mul_relu[2];
+
     vk_pipeline pipeline_leaky_relu_f32;
     vk_pipeline pipeline_silu_back_f32;
     vk_pipeline pipeline_diag_mask_inf_f32;
@@ -2747,6 +2752,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
     CREATE_UNARY(sigmoid)
 #undef CREATE_UNARY
 
+    ggml_vk_create_pipeline(device, device->pipeline_fused_mul_silu[0], "fused_mul_silu_f32", fused_mul_silu_f32_len, fused_mul_silu_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_fused_mul_silu[1], "fused_mul_silu_f16", fused_mul_silu_f16_len, fused_mul_silu_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_fused_mul_gelu[0], "fused_mul_gelu_f32", fused_mul_gelu_f32_len, fused_mul_gelu_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_fused_mul_gelu[1], "fused_mul_gelu_f16", fused_mul_gelu_f16_len, fused_mul_gelu_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_fused_mul_relu[0], "fused_mul_relu_f32", fused_mul_relu_f32_len, fused_mul_relu_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_fused_mul_relu[1], "fused_mul_relu_f16", fused_mul_relu_f16_len, fused_mul_relu_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
@@ -6393,6 +6405,26 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_rms_norm_back_f32;
         }
         return nullptr;
+    case GGML_OP_FUSED_MUL_UNARY:
+        if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) ||
+            (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) ||
+            (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) ||
+            (src0->type != dst->type) || (src1->type != dst->type)) {
+            return nullptr;
+        } else {
+            ggml_unary_op unary_op = (ggml_unary_op)dst->op_params[0];
+            switch (unary_op) {
+                case GGML_UNARY_OP_SILU:
+                    return ctx->device->pipeline_fused_mul_silu[dst->type == GGML_TYPE_F16];
+                case GGML_UNARY_OP_GELU:
+                    return ctx->device->pipeline_fused_mul_gelu[dst->type == GGML_TYPE_F16];
+                case GGML_UNARY_OP_RELU:
+                    return ctx->device->pipeline_fused_mul_relu[dst->type == GGML_TYPE_F16];
+                default:
+                    break;
+            }
+            return nullptr;
+        }
     case GGML_OP_UNARY:
         if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) ||
             (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) ||
@@ -6830,6 +6862,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     case GGML_OP_CPY:
     case GGML_OP_CONCAT:
     case GGML_OP_UPSCALE:
+    case GGML_OP_FUSED_MUL_UNARY:
     case GGML_OP_UNARY:
         {
             uint32_t ne = ggml_nelements(dst);
@@ -7212,6 +7245,13 @@ static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, con
     ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
 }
 
+static void ggml_vk_fused_mul_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_FUSED_MUL_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
+}
+
 static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     int32_t * op_params = (int32_t *)dst->op_params;
     ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun);
@@ -8396,6 +8436,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
             return false;
         }
         break;
+    case GGML_OP_FUSED_MUL_UNARY:
     case GGML_OP_REPEAT:
     case GGML_OP_REPEAT_BACK:
     case GGML_OP_GET_ROWS:
@@ -8478,6 +8519,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_FUSED_RMS_NORM:
     case GGML_OP_RMS_NORM_BACK:
     case GGML_OP_UNARY:
+    case GGML_OP_FUSED_MUL_UNARY:
     case GGML_OP_DIAG_MASK_INF:
     case GGML_OP_SOFT_MAX:
     case GGML_OP_SOFT_MAX_BACK:
@@ -8590,6 +8632,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_RMS_NORM_BACK:
         ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node, dryrun);
 
+        break;
+    case GGML_OP_FUSED_MUL_UNARY:
+        ggml_vk_fused_mul_unary(ctx, compute_ctx, src0, src1, node, dryrun);
         break;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
@@ -8762,6 +8807,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
     case GGML_OP_LEAKY_RELU:
     case GGML_OP_REPEAT:
     case GGML_OP_REPEAT_BACK:
+    case GGML_OP_FUSED_MUL_UNARY:
         buf = tensor->buffer;
 
         break;
@@ -9445,6 +9491,19 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
             return false;
         }
         break;
+    case GGML_OP_FUSED_MUL_UNARY:
+        switch ((ggml_unary_op)op->op_params[0]) {
+            case GGML_UNARY_OP_GELU:
+            case GGML_UNARY_OP_SILU:
+            case GGML_UNARY_OP_RELU:
+                return ggml_is_contiguous(op->src[0]) && ggml_are_same_shape(op->src[0], op->src[1]) &&
+                    (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
+                    (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16) &&
+                    (op->src[0]->type == op->type) && (op->src[1]->type == op->type);
+            default:
+                return false;
+        }
+        break;
     case GGML_OP_MUL_MAT:
     case GGML_OP_MUL_MAT_ID:
         {
@@ -10169,6 +10228,8 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
             std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
             GGML_ABORT("fatal error");
         }
+    } else if (tensor->op == GGML_OP_FUSED_MUL_UNARY) {
+        tensor_clone = ggml_fused_mul_unary(ggml_ctx, src_clone[0], src_clone[1], (ggml_unary_op)tensor->op_params[0]);
     } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
         if (src1 == nullptr) {
             tensor_clone = ggml_dup(ggml_ctx, src_clone[0]);
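Host-side, GGML_OP_FUSED_MUL_UNARY rides the generic ggml_vk_op_f32 path with a plain vk_op_push_constants payload, so the p.KX bound the shaders test against is simply ggml_nelements(src0). A minimal sketch of how a graph requests the fused node through the existing ggml API (the helper and tensor names here are illustrative, not part of this commit):

    #include "ggml.h"

    // Build one fused node computing dst = up * silu(gate).
    // The Vulkan backend accepts it when gate is contiguous and
    // gate, up, and dst share the same shape and F32/F16 type.
    static struct ggml_tensor * fused_gate(struct ggml_context * ctx,
                                           struct ggml_tensor * gate,  // src0
                                           struct ggml_tensor * up) {  // src1
        return ggml_fused_mul_unary(ctx, gate, up, GGML_UNARY_OP_SILU);
    }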
ggml/src/vulkan-shaders/fused_mul_gelu.comp

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const float GELU_COEF_A = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (i >= p.KX) {
+        return;
+    }
+
+    const float xi = float(data_a[i]);
+    const float yi = float(data_b[i]);
+    const float val = SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi);
+    data_d[i] = D_TYPE(0.5f*xi*yi*(2.0f - 2.0f / (exp(2 * val) + 1)));
+}
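The closing expression is the tanh-based GELU approximation written without a tanh() call: since tanh(v) = 1 - 2/(exp(2v) + 1), the factor (2.0f - 2.0f / (exp(2 * val) + 1)) equals 1 + tanh(val), so the store reduces to

    data_d[i] = yi * 0.5 * xi * (1 + tanh(val)),  val = sqrt(2/pi) * xi * (1 + 0.044715 * xi^2)

i.e. data_b times the usual tanh-approximated GELU of data_a.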
ggml/src/vulkan-shaders/fused_mul_relu.comp

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (i >= p.KX) {
+        return;
+    }
+
+    data_d[i] = D_TYPE(float(data_b[i])*max(float(data_a[i]), 0));
+}
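The ReLU variant needs no such rewriting: the store is simply data_d[i] = data_b[i] * max(data_a[i], 0), with both operands widened to float first so the f16 variants also form the product at 32-bit precision before the final D_TYPE cast.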
ggml/src/vulkan-shaders/fused_mul_silu.comp

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
+layout (binding = 1) readonly buffer Y {B_TYPE data_b[];};
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
+
+void main() {
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (i >= p.KX) {
+        return;
+    }
+
+    const float xi = float(data_a[i]);
+    const float yi = float(data_b[i]);
+    data_d[i] = D_TYPE(xi * yi / (1.0f + exp(-xi)));
+}
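Here the sigmoid is folded into the product: since silu(x) = x * sigmoid(x) = x / (1 + exp(-x)), the store xi * yi / (1.0f + exp(-xi)) is exactly data_b times silu(data_a), the same value an unfused GGML_OP_UNARY (SILU) followed by GGML_OP_MUL would produce, minus one round trip through global memory for the intermediate tensor.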

ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp

Lines changed: 7 additions & 0 deletions
@@ -572,6 +572,13 @@ void process_shaders() {
 
     string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
 
+    string_to_spv("fused_mul_gelu_f16", "fused_mul_gelu.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("fused_mul_gelu_f32", "fused_mul_gelu.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("fused_mul_silu_f16", "fused_mul_silu.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("fused_mul_silu_f32", "fused_mul_silu.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("fused_mul_relu_f16", "fused_mul_relu.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
+    string_to_spv("fused_mul_relu_f32", "fused_mul_relu.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+
     string_to_spv("gelu_f16", "gelu.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
     string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("gelu_quick_f16", "gelu_quick.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});

src/llama.cpp

Lines changed: 1 addition & 7 deletions
@@ -9688,13 +9688,7 @@ static struct ggml_tensor * llm_build_ffn(
         cur = tmp;
     }
 
-#ifdef GGML_USE_VULKAN
-    constexpr bool use_fused_mul_unary = false;
-#else
-    constexpr bool use_fused_mul_unary = true;
-#endif
-
-    if (use_fused_mul_unary && type_gate == LLM_FFN_PAR &&
+    if (type_gate == LLM_FFN_PAR &&
         (type_op == LLM_FFN_SILU || type_op == LLM_FFN_RELU || (type_op == LLM_FFN_GELU && !act_scales))) {
         cur = ggml_fused_mul_unary(ctx, cur, tmp, type_op == LLM_FFN_SILU ? GGML_UNARY_OP_SILU :
                                    type_op == LLM_FFN_RELU ? GGML_UNARY_OP_RELU : GGML_UNARY_OP_GELU);
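With a Vulkan implementation now available, the #ifdef GGML_USE_VULKAN opt-out is dropped and llm_build_ffn always fuses the parallel-gate multiply with the activation. For the SILU case the branch collapses a two-node sequence into one (a sketch of the equivalent graphs, not code from this commit):

    // unfused: two nodes plus an intermediate tensor
    cur = ggml_silu(ctx, cur);      // GGML_OP_UNARY (SILU)
    cur = ggml_mul(ctx, cur, tmp);  // GGML_OP_MUL

    // fused: a single node, no intermediate
    cur = ggml_fused_mul_unary(ctx, cur, tmp, GGML_UNARY_OP_SILU);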
