@@ -442,6 +442,11 @@ struct vk_device_struct {
     vk_pipeline pipeline_tanh[2];
     vk_pipeline pipeline_sigmoid[2];
 
+    // [src/dst 0=fp32,1=fp16]
+    vk_pipeline pipeline_fused_mul_gelu[2];
+    vk_pipeline pipeline_fused_mul_silu[2];
+    vk_pipeline pipeline_fused_mul_relu[2];
+
     vk_pipeline pipeline_leaky_relu_f32;
     vk_pipeline pipeline_silu_back_f32;
     vk_pipeline pipeline_diag_mask_inf_f32;
@@ -2747,6 +2752,13 @@ static void ggml_vk_load_shaders(vk_device& device) {
     CREATE_UNARY(sigmoid)
 #undef CREATE_UNARY
 
+    ggml_vk_create_pipeline(device, device->pipeline_fused_mul_silu[0], "fused_mul_silu_f32", fused_mul_silu_f32_len, fused_mul_silu_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_fused_mul_silu[1], "fused_mul_silu_f16", fused_mul_silu_f16_len, fused_mul_silu_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_fused_mul_gelu[0], "fused_mul_gelu_f32", fused_mul_gelu_f32_len, fused_mul_gelu_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_fused_mul_gelu[1], "fused_mul_gelu_f16", fused_mul_gelu_f16_len, fused_mul_gelu_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_fused_mul_relu[0], "fused_mul_relu_f32", fused_mul_relu_f32_len, fused_mul_relu_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(device, device->pipeline_fused_mul_relu[1], "fused_mul_relu_f16", fused_mul_relu_f16_len, fused_mul_relu_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
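Each fused pipeline binds three buffers (src0, src1, dst) and dispatches 512-wide workgroups, so a separate unary pass plus a multiply pass collapse into a single dispatch. As a hedged scalar reference for what the shaders are presumed to compute per element (dst[i] = unary(src0[i]) * src1[i]; the operand order is an assumption, the GELU constants match ggml's tanh approximation):

```cpp
#include <cmath>

// Scalar reference sketch (assumption: dst[i] = unary(src0[i]) * src1[i]).
static float fused_mul_silu_ref(float x, float y) {
    return x / (1.0f + std::exp(-x)) * y;              // silu(x) * y
}

static float fused_mul_gelu_ref(float x, float y) {
    const float GELU_COEF_A    = 0.044715f;            // ggml's tanh-GELU constants
    const float SQRT_2_OVER_PI = 0.79788456080286535588f;
    return 0.5f * x * (1.0f + std::tanh(SQRT_2_OVER_PI * x
                        * (1.0f + GELU_COEF_A * x * x))) * y; // gelu(x) * y
}

static float fused_mul_relu_ref(float x, float y) {
    return (x > 0.0f ? x : 0.0f) * y;                  // relu(x) * y
}
```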
@@ -6393,6 +6405,26 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_rms_norm_back_f32;
         }
         return nullptr;
+    case GGML_OP_FUSED_MUL_UNARY:
+        if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) ||
+            (src1->type != GGML_TYPE_F32 && src1->type != GGML_TYPE_F16) ||
+            (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) ||
+            (src0->type != dst->type) || (src1->type != dst->type)) {
+            return nullptr;
+        } else {
+            ggml_unary_op unary_op = (ggml_unary_op)dst->op_params[0];
+            switch (unary_op) {
+                case GGML_UNARY_OP_SILU:
+                    return ctx->device->pipeline_fused_mul_silu[dst->type == GGML_TYPE_F16];
+                case GGML_UNARY_OP_GELU:
+                    return ctx->device->pipeline_fused_mul_gelu[dst->type == GGML_TYPE_F16];
+                case GGML_UNARY_OP_RELU:
+                    return ctx->device->pipeline_fused_mul_relu[dst->type == GGML_TYPE_F16];
+                default:
+                    break;
+            }
+            return nullptr;
+        }
     case GGML_OP_UNARY:
         if ((src0->type != GGML_TYPE_F32 && src0->type != GGML_TYPE_F16) ||
             (dst->type != GGML_TYPE_F32 && dst->type != GGML_TYPE_F16) ||
@@ -6830,6 +6862,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
     case GGML_OP_CPY:
     case GGML_OP_CONCAT:
     case GGML_OP_UPSCALE:
+    case GGML_OP_FUSED_MUL_UNARY:
     case GGML_OP_UNARY:
         {
             uint32_t ne = ggml_nelements(dst);
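All of these element-wise cases share the generic push-constant block named in the pipeline creation above (`sizeof(vk_op_push_constants)`). The fused op only populates the element count. A sketch of the assumed layout, since the struct itself is defined elsewhere in ggml-vulkan.cpp and not shown in this diff:

```cpp
// Assumed layout of the shared push-constant block; the fused dispatch
// below passes { ggml_nelements(src0), 0, 0.0f, 0.0f }, so only KX matters.
struct vk_op_push_constants {
    uint32_t KX;      // number of elements to process
    uint32_t KY;      // secondary extent, unused by the fused op
    float    param1;  // op-specific scalar, unused here
    float    param2;  // op-specific scalar, unused here
};
```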
@@ -7212,6 +7245,13 @@ static void ggml_vk_unary(ggml_backend_vk_context * ctx, vk_context& subctx, con
     ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
 }
 
+static void ggml_vk_fused_mul_unary(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    GGML_ASSERT(ggml_are_same_shape(src0, src1));
+    GGML_ASSERT(ggml_are_same_shape(src0, dst));
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_FUSED_MUL_UNARY, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
+}
+
 static void ggml_vk_diag_mask_inf(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     int32_t * op_params = (int32_t *)dst->op_params;
     ggml_vk_op_f32<vk_op_diag_mask_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_DIAG_MASK_INF, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0] }, dryrun);
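On the graph side, GGML_OP_FUSED_MUL_UNARY nodes come from `ggml_fused_mul_unary`, the same builder the check-results hunk below uses to clone the op. A hedged usage sketch of a SwiGLU-style feed-forward block built around it; the `ffn_swiglu` helper and weight names are illustrative, not part of this patch:

```cpp
#include "ggml.h"

// Illustrative only: yields one GGML_OP_FUSED_MUL_UNARY node instead of
// separate GGML_UNARY_OP_SILU and GGML_OP_MUL nodes.
static ggml_tensor * ffn_swiglu(ggml_context * ctx, ggml_tensor * cur,
                                ggml_tensor * w_gate, ggml_tensor * w_up,
                                ggml_tensor * w_down) {
    ggml_tensor * gate = ggml_mul_mat(ctx, w_gate, cur);
    ggml_tensor * up   = ggml_mul_mat(ctx, w_up,   cur);
    ggml_tensor * act  = ggml_fused_mul_unary(ctx, gate, up, GGML_UNARY_OP_SILU);
    return ggml_mul_mat(ctx, w_down, act);
}
```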
@@ -8396,6 +8436,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
             return false;
         }
         break;
+    case GGML_OP_FUSED_MUL_UNARY:
     case GGML_OP_REPEAT:
     case GGML_OP_REPEAT_BACK:
     case GGML_OP_GET_ROWS:
@@ -8478,6 +8519,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_FUSED_RMS_NORM:
     case GGML_OP_RMS_NORM_BACK:
     case GGML_OP_UNARY:
+    case GGML_OP_FUSED_MUL_UNARY:
     case GGML_OP_DIAG_MASK_INF:
     case GGML_OP_SOFT_MAX:
     case GGML_OP_SOFT_MAX_BACK:
@@ -8590,6 +8632,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     case GGML_OP_RMS_NORM_BACK:
         ggml_vk_rms_norm_back(ctx, compute_ctx, src0, src1, node, dryrun);
 
+        break;
+    case GGML_OP_FUSED_MUL_UNARY:
+        ggml_vk_fused_mul_unary(ctx, compute_ctx, src0, src1, node, dryrun);
         break;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(node)) {
@@ -8762,6 +8807,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
     case GGML_OP_LEAKY_RELU:
     case GGML_OP_REPEAT:
     case GGML_OP_REPEAT_BACK:
+    case GGML_OP_FUSED_MUL_UNARY:
         buf = tensor->buffer;
 
         break;
@@ -9445,6 +9491,19 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
                     return false;
             }
             break;
+        case GGML_OP_FUSED_MUL_UNARY:
+            switch ((ggml_unary_op)op->op_params[0]) {
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                    return ggml_is_contiguous(op->src[0]) && ggml_are_same_shape(op->src[0], op->src[1]) &&
+                           (op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
+                           (op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == GGML_TYPE_F16) &&
+                           (op->src[0]->type == op->type) && (op->src[1]->type == op->type);
+                default:
+                    return false;
+            }
+            break;
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
            {
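This guard deliberately mirrors the type checks in ggml_vk_op_get_pipeline, so the whole condition can be read as one predicate. A hypothetical restatement (not in the patch) that makes the type-symmetry requirement explicit:

```cpp
#include "ggml.h"

// Hypothetical helper equivalent to the inline guard above: all three
// tensors share one type (F32 or F16), src0 is contiguous, and src0/src1
// have the same shape.
static bool vk_fused_mul_unary_supported(const ggml_tensor * op) {
    const ggml_type t = op->type;
    const bool type_ok = (t == GGML_TYPE_F32 || t == GGML_TYPE_F16) &&
                         op->src[0]->type == t && op->src[1]->type == t;
    return type_ok && ggml_is_contiguous(op->src[0]) &&
           ggml_are_same_shape(op->src[0], op->src[1]);
}
```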
@@ -10169,6 +10228,8 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
             std::cerr << "Missing vk_check_results OP: " << ggml_op_name(tensor->op) << std::endl;
             GGML_ABORT("fatal error");
         }
+    } else if (tensor->op == GGML_OP_FUSED_MUL_UNARY) {
+        tensor_clone = ggml_fused_mul_unary(ggml_ctx, src_clone[0], src_clone[1], (ggml_unary_op)tensor->op_params[0]);
     } else if (tensor->op == GGML_OP_CPY || tensor->op == GGML_OP_DUP) {
         if (src1 == nullptr) {
             tensor_clone = ggml_dup(ggml_ctx, src_clone[0]);