
Commit d0acca3

Add GEGLU backward (Vulkan) to enable Gemma training.
Parent: cd8c3e6

File tree: 9 files changed, +208 −2 lines changed
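Note (not part of the diff): the split GEGLU used in Gemma's feed-forward block is an elementwise product of a linear half x and a GELU-gated half g, so the backward pass added by this commit follows from the product and chain rules:

    y = x \odot \mathrm{GELU}(g)
    \frac{\partial L}{\partial x} = \frac{\partial L}{\partial y} \odot \mathrm{GELU}(g)
    \frac{\partial L}{\partial g} = \frac{\partial L}{\partial y} \odot x \odot \mathrm{GELU}'(g)

These two expressions are what the new CPU kernel, Vulkan shader, and autodiff hook below are built around.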

ggml/include/ggml.h

Lines changed: 7 additions & 0 deletions
@@ -479,6 +479,7 @@ extern "C" {
         GGML_OP_REPEAT_BACK,
         GGML_OP_CONCAT,
         GGML_OP_SILU_BACK,
+        GGML_OP_GEGLU_BACK,
         GGML_OP_NORM, // normalize
         GGML_OP_RMS_NORM,
         GGML_OP_RMS_NORM_BACK,

@@ -1130,6 +1131,12 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);
 
+    GGML_API struct ggml_tensor * ggml_geglu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * grad,
+            struct ggml_tensor  * x,
+            struct ggml_tensor  * g);
+
     // hardswish(x) = x * relu6(x + 3) / 6
     GGML_API struct ggml_tensor * ggml_hardswish(
             struct ggml_context * ctx,

ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 5 additions & 0 deletions
@@ -1759,6 +1759,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_silu_back(params, tensor);
             } break;
+        case GGML_OP_GEGLU_BACK:
+            {
+                ggml_compute_forward_geglu_back(params, tensor);
+            } break;
         case GGML_OP_NORM:
             {
                 ggml_compute_forward_norm(params, tensor);

@@ -2210,6 +2214,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
             }
             break;
         case GGML_OP_SILU_BACK:
+        case GGML_OP_GEGLU_BACK:
         case GGML_OP_MUL:
         case GGML_OP_DIV:
         case GGML_OP_NORM:
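Note (not part of the diff): grouping GGML_OP_GEGLU_BACK with SILU_BACK/MUL/NORM in ggml_get_n_tasks gives it n_tasks = n_threads, and the CPU kernel below parallelizes over rows with the usual stride pattern (i1 starts at ith and advances by nth). Worked example under that scheme: with 4 threads and dst->ne[1] = 10 rows, thread 0 processes rows 0, 4, 8; thread 1 rows 1, 5, 9; thread 2 rows 2, 6; thread 3 rows 3, 7.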

ggml/src/ggml-cpu/ops.cpp

Lines changed: 65 additions & 0 deletions
@@ -3420,6 +3420,71 @@ void ggml_compute_forward_silu_back(
     }
 }
 
+static void ggml_compute_forward_geglu_back_f32(
+        const ggml_compute_params * params,
+        const struct ggml_tensor * grad,
+        const struct ggml_tensor * x,
+        const struct ggml_tensor * g,
+        struct ggml_tensor * dst) {
+
+    GGML_ASSERT(ggml_can_repeat(grad, dst));
+    GGML_ASSERT(ggml_are_same_shape(x, g));
+    GGML_ASSERT(grad->type == GGML_TYPE_F32);
+    GGML_ASSERT(x->type == GGML_TYPE_F32);
+    GGML_ASSERT(g->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int ith = params->ith;
+    const int nth = params->nth;
+
+    const int nc = dst->ne[0];
+
+    GGML_ASSERT(nc % 2 == 0);
+
+    const size_t nb1 = dst->nb[1];
+    const size_t nb2 = dst->nb[2];
+    const size_t nb3 = dst->nb[3];
+
+    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+        for (int i2 = 0; i2 < dst->ne[2]; i2++) {
+            for (int i1 = ith; i1 < dst->ne[1]; i1 += nth) {
+                float * dst_ptr = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1);
+                const float * grad_ptr = (const float *)((char *) grad->data + i3*grad->nb[3] + i2*grad->nb[2] + i1*grad->nb[1]);
+                const float * x_ptr = (const float *)((char *) x->data + i3*x->nb[3] + i2*x->nb[2] + i1*x->nb[1]);
+                const float * g_ptr = (const float *)((char *) g->data + i3*g->nb[3] + i2*g->nb[2] + i1*g->nb[1]);
+
+                const int half = nc / 2;
+                ggml_vec_gelu_f32(half, dst_ptr, g_ptr);
+                ggml_vec_mul_f32(half, dst_ptr, dst_ptr, grad_ptr);
+                float * temp = (float *)alloca(half * sizeof(float));
+                ggml_vec_gelu_backward_f32(half, temp, g_ptr, grad_ptr);
+                ggml_vec_mul_f32(half, dst_ptr + half, temp, x_ptr);
+            }
+        }
+    }
+}
+
+void ggml_compute_forward_geglu_back(
+        const ggml_compute_params * params,
+        ggml_tensor * dst) {
+
+    const struct ggml_tensor * grad = dst->src[0];
+    const struct ggml_tensor * x = dst->src[1];
+    const struct ggml_tensor * g = dst->src[2];
+
+    switch (dst->type) {
+        case GGML_TYPE_F32:
+            {
+                ggml_compute_forward_geglu_back_f32(params, grad, x, g, dst);
+            } break;
+        default:
+            {
+                GGML_ABORT("fatal error");
+            }
+    }
+}
+
+
 // ggml_compute_forward_reglu
 
 static void ggml_compute_forward_reglu_f32(
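Note (not part of the diff): the f32 kernel above packs both gradients into one destination row — the first nc/2 columns receive grad · GELU(g) and the second nc/2 columns receive grad · x · GELU'(g). A minimal scalar sketch of that per-row computation follows; the gelu_ref/gelu_grad_ref helpers are hypothetical stand-ins for the ggml_vec_* routines, not ggml API.

#include <math.h>

// Hypothetical scalar reference for one row of the kernel above (illustrative,
// not the committed implementation). dst has nc = 2*half columns: [d_x | d_g].
static float gelu_ref(float v) {
    const float c = 0.797884560802865f; // sqrt(2/pi)
    const float a = 0.044715f;
    return 0.5f * v * (1.0f + tanhf(c * v * (1.0f + a * v * v)));
}

static float gelu_grad_ref(float v) {
    const float c = 0.797884560802865f; // sqrt(2/pi)
    const float a = 0.044715f;
    const float t = tanhf(c * v * (1.0f + a * v * v));
    const float s = 1.0f - t * t;       // sech^2
    return 0.5f * (1.0f + t + v * c * (1.0f + 3.0f * a * v * v) * s);
}

static void geglu_back_row_ref(int half, float * dst,
                               const float * grad, const float * x, const float * g) {
    for (int i = 0; i < half; ++i) {
        dst[i]        = grad[i] * gelu_ref(g[i]);             // gradient w.r.t. x
        dst[half + i] = grad[i] * x[i] * gelu_grad_ref(g[i]); // gradient w.r.t. g
    }
}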

ggml/src/ggml-cpu/ops.h

Lines changed: 1 addition & 0 deletions
@@ -41,6 +41,7 @@ void ggml_compute_forward_repeat(const struct ggml_compute_params * params, stru
 void ggml_compute_forward_repeat_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_concat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_silu_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
+void ggml_compute_forward_geglu_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rms_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
 void ggml_compute_forward_rms_norm_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);

ggml/src/ggml-cpu/vec.h

Lines changed: 26 additions & 0 deletions
@@ -1308,6 +1308,32 @@ inline static void ggml_vec_silu_backward_f16(const int n, ggml_fp16_t * dx, con
     }
 }
 
+inline static float ggml_gelu_backward_f32(float x, float dy) {
+    const float tanh_arg = SQRT_2_OVER_PI * x * (1.0f + GELU_COEF_A * x * x);
+    const float tanh_val = tanhf(tanh_arg);
+    const float sech2_val = 1.0f - tanh_val * tanh_val;
+    const float dtanh_dx = SQRT_2_OVER_PI * (1.0f + 3.0f * GELU_COEF_A * x * x) * sech2_val;
+    return dy * 0.5f * (1.0f + tanh_val + x * dtanh_dx);
+}
+
+inline static void ggml_vec_gelu_backward_f32(const int n, float * dx, const float * x, const float * dy) {
+    for (int i = 0; i < n; ++i) {
+        dx[i] = ggml_gelu_backward_f32(x[i], dy[i]);
+    }
+}
+
+inline static void ggml_vec_gelu_backward_f16(const int n, ggml_fp16_t * dx, const ggml_fp16_t * x, const ggml_fp16_t * dy) {
+    for (int i = 0; i < n; ++i) {
+        float xi = GGML_CPU_FP16_TO_FP32(x[i]);
+        float tanh_arg = SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi);
+        float tanh_val = tanhf(tanh_arg);
+        float sech2_val = 1.0f - tanh_val * tanh_val;
+        float dtanh_dx = SQRT_2_OVER_PI * (1.0f + 3.0f * GELU_COEF_A * xi * xi) * sech2_val;
+
+        dx[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(dy[i]) * 0.5f * (1.0f + tanh_val + xi * dtanh_dx));
+    }
+}
+
 inline static void ggml_vec_reglu_f32 (const int n, float * y, const float * x, const float * g) {
     for (int i = 0; i < n; ++i) {
         y[i] = (x[i] > 0.f) ? x[i] * g[i] : 0.f;
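Note (not part of the diff): the ggml_gelu_backward_* helpers above differentiate the tanh approximation of GELU already used by ggml (SQRT_2_OVER_PI and GELU_COEF_A are the existing constants). With u(x) = \sqrt{2/\pi}\,x\,(1 + 0.044715\,x^2):

    \mathrm{GELU}(x) \approx \tfrac{1}{2}\,x\,(1 + \tanh u)
    u'(x) = \sqrt{2/\pi}\,(1 + 3 \cdot 0.044715\,x^2)
    \mathrm{GELU}'(x) \approx \tfrac{1}{2}\,(1 + \tanh u) + \tfrac{1}{2}\,x\,\operatorname{sech}^2(u)\,u'(x)

which is the 0.5f * (1.0f + tanh_val + x * dtanh_dx) expression in the code, with dtanh_dx = sech2_val * u'(x).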

ggml/src/ggml-vulkan/ggml-vulkan.cpp

Lines changed: 20 additions & 0 deletions
@@ -541,6 +541,7 @@ struct vk_device_struct {
 
     vk_pipeline pipeline_leaky_relu_f32;
     vk_pipeline pipeline_silu_back_f32;
+    vk_pipeline pipeline_geglu_back_f32;
     vk_pipeline pipeline_diag_mask_inf_f32;
     vk_pipeline pipeline_cross_entropy_loss_back_f32;
     vk_pipeline pipeline_soft_max_f32, pipeline_soft_max_f32_f16;

@@ -3393,6 +3394,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
     ggml_vk_create_pipeline(device, device->pipeline_leaky_relu_f32, "leaky_relu_f32", leaky_relu_f32_len, leaky_relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
     ggml_vk_create_pipeline(device, device->pipeline_silu_back_f32, "silu_back_f32", silu_back_f32_len, silu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
+    ggml_vk_create_pipeline(device, device->pipeline_geglu_back_f32, "geglu_back_f32", geglu_back_f32_len, geglu_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+
     ggml_vk_create_pipeline(device, device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {1, 512, 1}, {}, 1, true);
 
     ggml_vk_create_pipeline(device, device->pipeline_cross_entropy_loss_back_f32, "cross_entropy_loss_back_f32", cross_entropy_loss_back_f32_len, cross_entropy_loss_back_f32_data, "main", 4, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1);

@@ -7634,6 +7637,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
             return ctx->device->pipeline_silu_back_f32;
         }
         return nullptr;
+    case GGML_OP_GEGLU_BACK:
+        if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+            return ctx->device->pipeline_geglu_back_f32;
+        }
+        return nullptr;
     case GGML_OP_NORM:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
             return ctx->device->pipeline_norm_f32;

@@ -9064,6 +9072,10 @@ static void ggml_vk_silu_back(ggml_backend_vk_context * ctx, vk_context& subctx,
     ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_SILU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
 }
 
+static void ggml_vk_geglu_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
+    ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_GEGLU_BACK, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun);
+}
+
 static void ggml_vk_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
     float * op_params = (float *)dst->op_params;
 
@@ -10585,6 +10597,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_CONT:
     case GGML_OP_DUP:
     case GGML_OP_SILU_BACK:
+    case GGML_OP_GEGLU_BACK:
     case GGML_OP_NORM:
     case GGML_OP_GROUP_NORM:
     case GGML_OP_RMS_NORM:

@@ -10658,6 +10671,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_CONT:
     case GGML_OP_DUP:
     case GGML_OP_SILU_BACK:
+    case GGML_OP_GEGLU_BACK:
     case GGML_OP_NORM:
     case GGML_OP_GROUP_NORM:
     case GGML_OP_RMS_NORM:

@@ -10872,6 +10886,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
     case GGML_OP_SILU_BACK:
         ggml_vk_silu_back(ctx, compute_ctx, src0, src1, node, dryrun);
 
+        break;
+    case GGML_OP_GEGLU_BACK:
+        ggml_vk_geglu_back(ctx, compute_ctx, src0, src1, node, dryrun);
+
         break;
     case GGML_OP_NORM:
         ggml_vk_norm(ctx, compute_ctx, src0, node, dryrun);

@@ -11116,6 +11134,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
     case GGML_OP_CONT:
     case GGML_OP_DUP:
     case GGML_OP_SILU_BACK:
+    case GGML_OP_GEGLU_BACK:
     case GGML_OP_NORM:
     case GGML_OP_GROUP_NORM:
     case GGML_OP_RMS_NORM:

@@ -12544,6 +12563,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
             return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->src[2]->type == GGML_TYPE_I32 &&
                    op->type == GGML_TYPE_F32;
         case GGML_OP_SILU_BACK:
+        case GGML_OP_GEGLU_BACK:
         case GGML_OP_RMS_NORM_BACK:
         case GGML_OP_SQR:
         case GGML_OP_SQRT:
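Note (not part of the diff): the dispatch reuses the generic element-wise push-constant path, and the only value the new shader reads is p.KX, the total element count of src0; the remaining fields in the initializer { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f } are left at zero. A sketch of the presumed layout of that push-constant block, inferred from the four-value initializer (field names are illustrative, not quoted from this diff):

// Presumed generic element-wise push constants as used above (a sketch).
struct vk_op_push_constants_sketch {
    uint32_t KX;      // total element count of src0; geglu_back.comp reads p.KX
    uint32_t KY;      // unused here (0)
    float    param1;  // unused here (0.0f)
    float    param2;  // unused here (0.0f)
};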
ggml/src/ggml-vulkan/vulkan-shaders/geglu_back.comp

Lines changed: 53 additions & 0 deletions

@@ -0,0 +1,53 @@
+#version 450
+
+#include "generic_head.comp"
+#include "types.comp"
+
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
+
+layout (binding = 0) readonly buffer GRAD {A_TYPE data_grad[];};
+layout (binding = 1) readonly buffer X {B_TYPE data_x[];};
+layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
+
+float gelu(float x) {
+    const float c = 0.797884560802865; // sqrt(2/pi)
+    const float a = 0.044715;
+    const float inner = c * (x + a * x * x * x);
+    return 0.5 * x * (1.0 + tanh(inner));
+}
+
+float gelu_derivative(float x) {
+    const float c = 0.797884560802865; // sqrt(2/pi)
+    const float a = 0.044715;
+    const float x_squared = x * x;
+    const float x_cubed = x_squared * x;
+    const float inner = c * (x + a * x_cubed);
+    const float tanh_val = tanh(inner);
+    const float sech2_val = 1.0 - tanh_val * tanh_val;
+    const float dtanh_dx = c * (1.0 + 3.0 * a * x_squared) * sech2_val;
+    return 0.5 * (1.0 + tanh_val + x * dtanh_dx);
+}
+
+void main() {
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
+
+    if (i >= p.KX) {
+        return;
+    }
+
+    const uint half_size = p.KX / 2;
+
+    if (i < half_size) {
+        const float grad_val = float(data_grad[i]);
+        const float g_val = float(data_x[i + half_size]);
+        data_d[i] = D_TYPE(grad_val * gelu(g_val));
+    } else {
+        const uint idx = i - half_size;
+        const float grad_val = float(data_grad[idx]);
+        const float x_val = float(data_x[idx]);
+        const float g_val = float(data_x[i]);
+        data_d[i] = D_TYPE(grad_val * x_val * gelu_derivative(g_val));
+    }
+}
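Note (not part of the diff; the file name above is inferred from the "geglu_back.comp" registration in vulkan-shaders-gen.cpp below): the shader runs one invocation per output element, flattening the dispatch as i = z·262144 + y·512 + x to match the 512-wide workgroups. As wired here, binding 1 (data_x) holds the linear half in [0, KX/2) and the gated half in [KX/2, KX), and the output is packed the same way, so each element computes:

    d[i] = grad[i] * GELU(x[i + KX/2])                 for i <  KX/2
    d[i] = grad[i - KX/2] * x[i - KX/2] * GELU'(x[i])  for i >= KX/2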

ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp

Lines changed: 1 addition & 0 deletions
@@ -684,6 +684,7 @@ void process_shaders() {
 
     string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
     string_to_spv("silu_back_f32", "silu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
+    string_to_spv("geglu_back_f32", "geglu_back.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
 
     string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
 
ggml/src/ggml.c

Lines changed: 30 additions & 2 deletions
@@ -942,6 +942,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "REPEAT_BACK",
     "CONCAT",
     "SILU_BACK",
+    "GEGLU_BACK",
     "NORM",
     "RMS_NORM",
     "RMS_NORM_BACK",

@@ -1019,7 +1020,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
     "GLU",
 };
 
-static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
+static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");
 
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "none",

@@ -1046,6 +1047,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "repeat_back(x)",
     "concat(x, y)",
     "silu_back(x)",
+    "geglu_back(x)",
     "norm(x)",
     "rms_norm(x)",
     "rms_norm_back(x)",

@@ -1123,7 +1125,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "glu(x)",
 };
 
-static_assert(GGML_OP_COUNT == 90, "GGML_OP_COUNT != 90");
+static_assert(GGML_OP_COUNT == 91, "GGML_OP_COUNT != 91");
 
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
 
@@ -2667,6 +2669,22 @@ struct ggml_tensor * ggml_silu_back(
     return result;
 }
 
+// ggml_geglu_back
+struct ggml_tensor * ggml_geglu_back(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * grad,
+        struct ggml_tensor  * x,
+        struct ggml_tensor  * g) {
+    struct ggml_tensor * result = ggml_dup_tensor(ctx, x);
+
+    result->op     = GGML_OP_GEGLU_BACK;
+    result->src[0] = grad;
+    result->src[1] = x;
+    result->src[2] = g;
+
+    return result;
+}
+
 // ggml_hardswish
 
 struct ggml_tensor * ggml_hardswish(

@@ -6389,6 +6407,16 @@ static void ggml_compute_backward(
                 ggml_add_or_set(ctx, cgraph, isrc1, ggml_mul(ctx, ggml_silu(ctx, src0), grad));
             }
         } break;
+        case GGML_GLU_OP_GEGLU: {
+            if (src0_needs_grads) {
+                GGML_ASSERT(src1 && "backward pass only implemented for split geglu");
+                ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, grad, ggml_gelu(ctx, src1)));
+            }
+            if (src1_needs_grads) {
+                struct ggml_tensor * grad_mul_src0 = ggml_mul(ctx, grad, src0);
+                ggml_add_or_set(ctx, cgraph, isrc1, ggml_geglu_back(ctx, grad_mul_src0, src1, src1));
+            }
+        } break;
        default: {
            GGML_ABORT("unsupported glu op for backward pass: %s", ggml_glu_op_name(ggml_get_glu_op(tensor)));
        } //break;
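Note (not part of the diff): for a split GEGLU node out = src0 ⊙ GELU(src1), the src0 branch of the new hook accumulates the product-rule term directly,

    \frac{\partial L}{\partial \mathrm{src0}} \mathrel{+}= grad \odot \mathrm{GELU}(\mathrm{src1}),

while the src1 branch first forms grad ⊙ src0 (grad_mul_src0) and hands it to ggml_geglu_back together with src1, leaving the GELU'(src1) factor to the backend kernels added above.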
