diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 75fd6db14c514..1c283bc1543b8 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2007,6 +2007,95 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     }
 }
 
+static bool ggml_cuda_should_fuse_mul_mat(const ggml_tensor * ffn_up,
+                                          const ggml_tensor * ffn_gate,
+                                          const ggml_tensor * glu) {
+    bool is_mul_mat = ffn_up->op == GGML_OP_MUL_MAT && ffn_gate->op == GGML_OP_MUL_MAT && glu->op == GGML_OP_GLU;
+    bool is_mul_mat_id =
+        ffn_up->op == GGML_OP_MUL_MAT_ID && ffn_gate->op == GGML_OP_MUL_MAT_ID && glu->op == GGML_OP_GLU;
+
+    GGML_ASSERT(ffn_up && ffn_gate && glu);
+
+    if (!is_mul_mat && !is_mul_mat_id) {
+        return false;
+    }
+
+    if (ffn_up->src[0]->type != ffn_gate->src[0]->type || !ggml_are_same_shape(ffn_up->src[0], ffn_gate->src[0]) ||
+        !ggml_are_same_stride(ffn_up->src[0], ffn_gate->src[0])) {
+        return false;
+    }
+
+    if (ffn_up->src[1] != ffn_gate->src[1]) {
+        return false;
+    }
+
+    if (ffn_up->src[2] && (ffn_up->src[2] != ffn_gate->src[2])) {
+        return false;
+    }
+
+    if (glu->src[0] != ffn_gate && glu->src[1] != ffn_up) {
+        return false;
+    }
+
+    static constexpr std::array valid_glu_ops = { GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU };
+
+    if (std::find(valid_glu_ops.begin(), valid_glu_ops.end(), ggml_get_glu_op(glu)) == valid_glu_ops.end()) {
+        return false;
+    }
+
+    if (const bool swapped = ggml_get_op_params_i32(glu, 1); swapped) {
+        return false;
+    }
+
+    const bool split = ggml_backend_buft_is_cuda_split(ffn_up->src[0]->buffer->buft) ||
+                       ggml_backend_buft_is_cuda_split(ffn_gate->src[0]->buffer->buft);
+
+    //TODO: add support for fusion for split buffers
+    if (split) {
+        return false;
+    }
+
+    return true;
+}
+
+static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
+    ggml_tensor * src0 = tensor->src[0];
+    ggml_tensor * src1 = tensor->src[1];
+    const ggml_tensor * dst = tensor;
+
+    bool use_mul_mat_vec_f =
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) &&
+        src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+
+    if (tensor->op == GGML_OP_MUL_MAT_ID) {
+        use_mul_mat_vec_f = use_mul_mat_vec_f && dst->ne[2] == 1;
+    }
+
+    return use_mul_mat_vec_f;
+}
+
+static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
+    ggml_tensor * src0 = tensor->src[0];
+    ggml_tensor * src1 = tensor->src[1];
+    const ggml_tensor * dst = tensor;
+
+    const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE &&
+                                   ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) &&
+                                   src0->view_src;
+
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 &&
+                             dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+
+    if (tensor->op == GGML_OP_MUL_MAT_ID) {
+        use_mul_mat_vec_q = use_mul_mat_vec_q && dst->ne[2] == 1;
+    }
+
+    return use_mul_mat_vec_q;
+}
+
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
 
@@ -2855,6 +2944,25 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         }
     }
 
+
std::initializer_list mul_mat_id_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_MUL_MAT_ID, GGML_OP_GLU }; + + std::initializer_list mul_mat_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT, GGML_OP_GLU }; + + if (ops.size() == 3 && (std::equal(ops.begin(), ops.end(), mul_mat_id_glu_ops.begin()) || + std::equal(ops.begin(), ops.end(), mul_mat_glu_ops.begin()))) { + if (node_idx + 2 >= cgraph->n_nodes) { + return false; + } + + const ggml_tensor * ffn_gate = cgraph->nodes[node_idx]; + const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 1]; + const ggml_tensor * glu = cgraph->nodes[node_idx + 2]; + + if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu)) { + return true; + } + } + if (!ggml_can_fuse(cgraph, node_idx, ops)) { return false; } @@ -2992,6 +3100,36 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx } } + bool fused_mul_mat_vec = false; + + for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) { + if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) { + ggml_tensor * glu = cgraph->nodes[i + 2]; + ggml_tensor * gate = glu->src[0]; + ggml_tensor * up = glu->src[1]; + + const ggml_tensor * src0 = up->src[0]; + const ggml_tensor * src1 = up->src[1]; + const ggml_tensor * ids = up->src[2]; + + if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) { + ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, gate->src[0], glu); + fused_mul_mat_vec = true; + break; + } + + if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) { + ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, gate->src[0], glu); + fused_mul_mat_vec = true; + break; + } + } + } + + if (fused_mul_mat_vec) { + i += 2; + continue; + } if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) { ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index 57ab839393aa0..04682cc5f43b5 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -2,13 +2,14 @@ #include "common.cuh" #include "convert.cuh" #include "mmvf.cuh" +#include "unary.cuh" -template +template static __global__ void mul_mat_vec_f( - const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst, + const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, const T * __restrict__ gate_x, float * __restrict__ dst, const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst, const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, - const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { + const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, const ggml_glu_op glu_op) { const int row = blockIdx.x; const int channel_dst = blockIdx.y; const int channel_x = ids ? 
ids[channel_dst] : fastdiv((uint32_t) channel_dst, channel_ratio); @@ -24,58 +25,85 @@ static __global__ void mul_mat_vec_f( y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y; dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst; + if constexpr (has_gate) { + gate_x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row; + } + const float2 * y2 = (const float2 *) y; extern __shared__ char data_mmv[]; float * buf_iw = (float *) data_mmv; + float * buf_iw_gate = has_gate ? (float *) (data_mmv + warp_size*sizeof(float)) : nullptr; if (block_size > warp_size) { if (tid < warp_size) { buf_iw[tid] = 0.0f; + if constexpr (has_gate) { + buf_iw_gate[tid] = 0.0f; + } } __syncthreads(); } float sumf[ncols_dst] = {0.0f}; + float sumf_gate[ncols_dst] = {0.0f}; if constexpr (std::is_same_v) { const float2 * x2 = (const float2 *) x; + const float2 * gate_x2 = has_gate ? (const float2 *) gate_x : nullptr; for (int col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = x2[col2]; + const float2 tmpx_gate = has_gate ? gate_x2[col2] : make_float2(0.0f, 0.0f); #pragma unroll for (int j = 0; j < ncols_dst; ++j) { const float2 tmpy = y2[j*stride_col_y2 + col2]; ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x); ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y); + + if constexpr (has_gate) { + ggml_cuda_mad(sumf_gate[j], tmpx_gate.x, tmpy.x); + ggml_cuda_mad(sumf_gate[j], tmpx_gate.y, tmpy.y); + } } } } else if constexpr (std::is_same_v) { const half2 * x2 = (const half2 *) x; + const half2 * gate_x2 = has_gate ? (const half2 *) gate_x : nullptr; if (std::is_same_v) { for (int col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = __half22float2(x2[col2]); - + const float2 tmpx_gate = has_gate ? __half22float2(gate_x2[col2]) : make_float2(0.0f, 0.0f); #pragma unroll for (int j = 0; j < ncols_dst; ++j) { const float2 tmpy = y2[j*stride_col_y2 + col2]; ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x); ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y); + + if constexpr (has_gate) { + ggml_cuda_mad(sumf_gate[j], tmpx_gate.x, tmpy.x); + ggml_cuda_mad(sumf_gate[j], tmpx_gate.y, tmpy.y); + } } } } else { #ifdef FP16_AVAILABLE half2 sumh2[ncols_dst] = {{0.0f, 0.0f}}; + half2 sumh2_gate[ncols_dst] = {{0.0f, 0.0f}}; for (int col2 = tid; col2 < ncols2; col2 += block_size) { const half2 tmpx = x2[col2]; - + const half2 tmpx_gate = has_gate ? gate_x2[col2] : make_half2(0.0f, 0.0f); #pragma unroll for (int j = 0; j < ncols_dst; ++j) { const float2 tmpy = y2[j*stride_col_y2 + col2]; sumh2[j] += tmpx * make_half2(tmpy.x, tmpy.y); + + if constexpr (has_gate) { + sumh2_gate[j] += tmpx_gate * make_half2(tmpy.x, tmpy.y); + } } } @@ -83,6 +111,12 @@ static __global__ void mul_mat_vec_f( for (int j = 0; j < ncols_dst; ++j) { sumf[j] = __low2float(sumh2[j]) + __high2float(sumh2[j]); } + + if constexpr (has_gate) { + for (int j = 0; j < ncols_dst; ++j) { + sumf_gate[j] = __low2float(sumh2_gate[j]) + __high2float(sumh2_gate[j]); + } + } #else NO_DEVICE_CODE; #endif // FP16_AVAILABLE @@ -91,8 +125,10 @@ static __global__ void mul_mat_vec_f( //TODO: add support for ggml_cuda_mad for hip_bfloat162 #if defined(GGML_USE_HIP) const int * x2 = (const int *) x; + const int * gate_x2 = has_gate ? (const int *) gate_x : nullptr; for (int col2 = tid; col2 < ncols2; col2 += block_size) { const int tmpx = x2[col2]; + const int tmpx_gate = has_gate ? 
gate_x2[col2] : 0; #pragma unroll for (int j = 0; j < ncols_dst; ++j) { const float2 tmpy = y2[j*stride_col_y2 + col2]; @@ -100,17 +136,34 @@ static __global__ void mul_mat_vec_f( const float tmpx1 = ggml_cuda_cast(reinterpret_cast(&tmpx)[1]); ggml_cuda_mad(sumf[j], tmpx0, tmpy.x); ggml_cuda_mad(sumf[j], tmpx1, tmpy.y); + + if constexpr (has_gate) { + const float tmpx0 = ggml_cuda_cast(reinterpret_cast(&tmpx_gate)[0]); + const float tmpx1 = ggml_cuda_cast(reinterpret_cast(&tmpx_gate)[1]); + ggml_cuda_mad(sumf_gate[j], tmpx0, tmpy.x); + ggml_cuda_mad(sumf_gate[j], tmpx1, tmpy.y); + } } } #else const nv_bfloat162 * x2 = (const nv_bfloat162 *) x; + const nv_bfloat162 * gate_x2 = has_gate ? (const nv_bfloat162 *) gate_x : nullptr; for (int col2 = tid; col2 < ncols2; col2 += block_size) { const nv_bfloat162 tmpx = x2[col2]; + nv_bfloat162 tmpx_gate; + if constexpr (has_gate) { + tmpx_gate = gate_x2[col2]; + } #pragma unroll for (int j = 0; j < ncols_dst; ++j) { const float2 tmpy = y2[j*stride_col_y2 + col2]; ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x); ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y); + + if constexpr (has_gate) { + ggml_cuda_mad(sumf_gate[j], tmpx_gate.x, tmpy.x); + ggml_cuda_mad(sumf_gate[j], tmpx_gate.y, tmpy.y); + } } } #endif @@ -122,13 +175,25 @@ static __global__ void mul_mat_vec_f( for (int j = 0; j < ncols_dst; ++j) { sumf[j] = warp_reduce_sum(sumf[j]); + if constexpr (has_gate) { + sumf_gate[j] = warp_reduce_sum(sumf_gate[j]); + } + if (block_size > warp_size) { buf_iw[tid/warp_size] = sumf[j]; + if constexpr (has_gate) { + buf_iw_gate[tid/warp_size] = sumf_gate[j]; + } __syncthreads(); if (tid < warp_size) { sumf[j] = buf_iw[tid]; sumf[j] = warp_reduce_sum(sumf[j]); + if constexpr (has_gate) { + sumf_gate[j] = buf_iw_gate[tid]; + sumf_gate[j] = warp_reduce_sum(sumf_gate[j]); + } } + if (j < ncols_dst) { __syncthreads(); } @@ -139,18 +204,35 @@ static __global__ void mul_mat_vec_f( return; } - dst[tid*stride_col_dst + row] = sumf[tid]; + if constexpr (has_gate) { + float gate_value = sumf_gate[tid]; + float gated_value; + switch (glu_op) { + case GGML_GLU_OP_SWIGLU: + gated_value = ggml_cuda_op_silu_single(gate_value); + break; + case GGML_GLU_OP_GEGLU: + gated_value = ggml_cuda_op_gelu_single(gate_value); + break; + default: + gated_value = gate_value; + break; + } + dst[tid*stride_col_dst + row] = sumf[tid] * gated_value; + } else { + dst[tid*stride_col_dst + row] = sumf[tid]; + } } -template +template static void launch_mul_mat_vec_f_cuda( - const T * x, const float * y, const int32_t * ids, float * dst, + const T * x, const float * y, const int32_t * ids, const T * gate_x, float * dst, const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, - cudaStream_t stream) { + const ggml_glu_op glu_op, cudaStream_t stream) { GGML_ASSERT(ncols % 2 == 0); GGML_ASSERT(stride_row % 2 == 0); GGML_ASSERT(stride_col_y % 2 == 0); @@ -176,57 +258,57 @@ static void launch_mul_mat_vec_f_cuda( } } - const int nbytes_shared = warp_size*sizeof(float); + const int nbytes_shared = warp_size*sizeof(float) + (has_gate ? 
warp_size*sizeof(float) : 0); const dim3 block_nums(nrows, nchannels_dst, nsamples_dst); const dim3 block_dims(block_size_best, 1, 1); switch (block_size_best) { case 32: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 64: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 96: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 128: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 160: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 192: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 224: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 256: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + 
mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; default: { GGML_ABORT("fatal error"); @@ -234,63 +316,63 @@ static void launch_mul_mat_vec_f_cuda( } } -template +template static void mul_mat_vec_f_cuda_switch_ncols_dst( - const T * x, const float * y, const int32_t * ids, float * dst, + const T * x, const float * y, const int32_t * ids, const T * gate_x, float * dst, const int64_t ncols, const int64_t nrows, const int64_t ncols_dst, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, - cudaStream_t stream) { + const ggml_glu_op glu_op, cudaStream_t stream) { switch (ncols_dst) { case 1: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 2: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 3: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 4: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 5: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, 
ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 6: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 7: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 8: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; default: GGML_ABORT("fatal error"); @@ -300,29 +382,48 @@ static void mul_mat_vec_f_cuda_switch_ncols_dst( template static void mul_mat_vec_f_cuda( - const T * x, const float * y, const int32_t * ids, float * dst, + const T * x, const float * y, const int32_t * ids, const T * gate_x, ggml_glu_op op, float * dst, const int64_t ncols, const int64_t nrows, const int64_t ncols_dst, const int64_t stride_row, const int64_t stride_col_y, const int stride_col_dst, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, enum ggml_prec prec, cudaStream_t stream) { - if constexpr(std::is_same_v) { - if (prec == GGML_PREC_DEFAULT) { - mul_mat_vec_f_cuda_switch_ncols_dst - (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - return; + + if (gate_x) { + if constexpr(std::is_same_v) { + if (prec == GGML_PREC_DEFAULT) { + mul_mat_vec_f_cuda_switch_ncols_dst + (x, y, ids, gate_x, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, 
stride_sample_x, stride_sample_y, stride_sample_dst, op, stream); + return; + } } + mul_mat_vec_f_cuda_switch_ncols_dst + (x, y, ids, gate_x, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, op, stream); + } + else { + if constexpr(std::is_same_v) { + if (prec == GGML_PREC_DEFAULT) { + mul_mat_vec_f_cuda_switch_ncols_dst + (x, y, ids, nullptr, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, op, stream); + return; + } + } + mul_mat_vec_f_cuda_switch_ncols_dst + (x, y, ids, nullptr, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, op, stream); } - mul_mat_vec_f_cuda_switch_ncols_dst - (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); } -void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { +void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, + const ggml_tensor * src0_gate, const ggml_tensor * glu) { GGML_ASSERT( src1->type == GGML_TYPE_F32); GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -341,6 +442,8 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); GGML_ASSERT( nb0 == ts_dst); + GGML_ASSERT(!src0_gate || ggml_are_same_stride(src0_gate, src0)); + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32; @@ -367,22 +470,27 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor GGML_ASSERT(!ids || ncols_dst == 1); + ggml_glu_op glu_op = glu ? ggml_get_glu_op(glu) : ggml_glu_op::GGML_GLU_OP_SWIGLU; + switch (src0->type) { case GGML_TYPE_F32: { const float * src0_d = (const float *) src0->data; - mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, + const float * src0_gate_d = src0_gate ? (const float *) src0_gate->data : nullptr; + mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, src0_gate_d, glu_op, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0->data; - mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, + const half * src0_gate_d = src0_gate ? 
(const half *) src0_gate->data : nullptr; + mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, src0_gate_d, glu_op, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data; - mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, + const nv_bfloat16 * src0_gate_d = src0_gate ? (const nv_bfloat16 *) src0_gate->data : nullptr; + mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, src0_gate_d, glu_op, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; @@ -429,19 +537,22 @@ void ggml_cuda_op_mul_mat_vec_f( switch (src0->type) { case GGML_TYPE_F32: { const float * src0_d = (const float *) src0_dd_i; - mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, + const float * src0_gate_d = nullptr; + mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, src0_gate_d, ggml_glu_op::GGML_GLU_OP_SWIGLU, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0_dd_i; - mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, + const half * src0_gate_d = nullptr; + mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, src0_gate_d, ggml_glu_op::GGML_GLU_OP_SWIGLU, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i; - mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, + const nv_bfloat16 * src0_gate_d = nullptr; + mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, src0_gate_d, ggml_glu_op::GGML_GLU_OP_SWIGLU, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; diff --git a/ggml/src/ggml-cuda/mmvf.cuh b/ggml/src/ggml-cuda/mmvf.cuh index 1da460992e784..4dc4ad42759d2 100644 --- a/ggml/src/ggml-cuda/mmvf.cuh +++ b/ggml/src/ggml-cuda/mmvf.cuh @@ -1,6 +1,7 @@ #include "common.cuh" -void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); +void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, + const ggml_tensor * src0_gate = nullptr, const ggml_tensor * glu = nullptr); void ggml_cuda_op_mul_mat_vec_f( ggml_backend_cuda_context & ctx, diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 3bf0c9ed25038..c99234693bf2f 100644 --- 
a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -1,5 +1,6 @@ #include "mmvq.cuh" #include "quantize.cuh" +#include "unary.cuh" #include "vecdotq.cuh" #include @@ -136,15 +137,15 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int return 1; } -template +template // tell the compiler to use as many registers as it wants, see nwarps definition below __launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) static __global__ void mul_mat_vec_q( - const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, float * __restrict__ dst, + const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const void * __restrict__ vgate, float * __restrict__ dst, const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio, - const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst) { + const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst, const ggml_glu_op glu_op) { constexpr int qk = ggml_cuda_type_traits::qk; constexpr int qi = ggml_cuda_type_traits::qi; @@ -171,6 +172,7 @@ static __global__ void mul_mat_vec_q( // partial sum for each thread float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}}; + float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}}; const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y; const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x; @@ -187,17 +189,28 @@ static __global__ void mul_mat_vec_q( for (int i = 0; i < rows_per_cuda_block; ++i) { tmp[j][i] += vec_dot_q_cuda( vx, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs); + if constexpr (has_gate) { + tmp_gate[j][i] += vec_dot_q_cuda( + vgate, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs); + } } } } __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; + __shared__ float tmp_shared_gate[(has_gate && (nwarps-1 > 0)) ? 
nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; + if constexpr (!has_gate) { + (void) tmp_shared_gate; + } if (threadIdx.y > 0) { #pragma unroll for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i]; + if constexpr (has_gate) { + tmp_shared_gate[threadIdx.y-1][j][i][threadIdx.x] = tmp_gate[j][i]; + } } } } @@ -216,12 +229,35 @@ static __global__ void mul_mat_vec_q( #pragma unroll for (int l = 0; l < nwarps-1; ++l) { tmp[j][i] += tmp_shared[l][j][i][threadIdx.x]; + if constexpr (has_gate) { + tmp_gate[j][i] += tmp_shared_gate[l][j][i][threadIdx.x]; + } } tmp[j][i] = warp_reduce_sum(tmp[j][i]); + if constexpr (has_gate) { + tmp_gate[j][i] = warp_reduce_sum(tmp_gate[j][i]); + } } if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) { - dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x]; + float result = tmp[j][threadIdx.x]; + if constexpr (has_gate) { + float gate_value = tmp_gate[j][threadIdx.x]; + float gated_value; + switch (glu_op) { + case GGML_GLU_OP_SWIGLU: + gated_value = ggml_cuda_op_silu_single(gate_value); + break; + case GGML_GLU_OP_GEGLU: + gated_value = ggml_cuda_op_gelu_single(gate_value); + break; + default: + gated_value = gate_value; + break; + } + result = result * gated_value; + } + dst[j*stride_col_dst + threadIdx.x] = result; } } } @@ -235,15 +271,15 @@ static std::pair calc_launch_params( return {block_nums, block_dims}; } -template +template static void mul_mat_vec_q_switch_ncols_dst( - const void * vx, const void * vy, const int32_t * ids, float * dst, + const void * vx, const void * vy, const int32_t * ids, const void * vgate, float * dst, const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int stride_col_y, const int stride_col_dst, const int nchannels_x, const int nchannels_y, const int nchannels_dst, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, - cudaStream_t stream) { + const ggml_glu_op glu_op, cudaStream_t stream) { GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0); GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE); @@ -261,66 +297,66 @@ static void mul_mat_vec_q_switch_ncols_dst( case 1: { constexpr int c_ncols_dst = 1; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 2: { constexpr int c_ncols_dst = 2; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, 
stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 3: { constexpr int c_ncols_dst = 3; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 4: { constexpr int c_ncols_dst = 4; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 5: { constexpr int c_ncols_dst = 5; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 6: { constexpr int c_ncols_dst = 6; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 7: { constexpr int c_ncols_dst = 7; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 8: { constexpr int c_ncols_dst = 8; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, 
stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; default: GGML_ABORT("fatal error"); @@ -328,154 +364,155 @@ static void mul_mat_vec_q_switch_ncols_dst( } } +template static void mul_mat_vec_q_switch_type( - const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, float * dst, + const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, const void * vgate, float * dst, const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int stride_col_y, const int stride_col_dst, const int nchannels_x, const int nchannels_y, const int nchannels_dst, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, - cudaStream_t stream) { + const ggml_glu_op glu_op, cudaStream_t stream) { switch (type_x) { case GGML_TYPE_Q4_0: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q4_1: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q5_0: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q5_1: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q8_0: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, 
stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_MXFP4: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q2_K: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q3_K: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q4_K: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q5_K: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q6_K: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ2_XXS: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ2_XS: - 
mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ2_S: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ3_XXS: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ1_S: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ1_M: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ4_NL: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ4_XS: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ3_S: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, 
ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; default: GGML_ABORT("fatal error"); @@ -484,10 +521,14 @@ static void mul_mat_vec_q_switch_type( } void ggml_cuda_mul_mat_vec_q( - ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, + const ggml_tensor * src0_gate, const ggml_tensor * glu) { GGML_ASSERT( src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID. + GGML_ASSERT(!src0_gate || src0_gate->type == src0->type); + GGML_ASSERT(!src0_gate || ggml_are_same_shape(src0_gate, src0)); + GGML_ASSERT(!src0_gate || glu != nullptr); GGML_TENSOR_BINARY_OP_LOCALS; @@ -504,9 +545,12 @@ void ggml_cuda_mul_mat_vec_q( GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. + GGML_ASSERT(!src0_gate || ggml_are_same_stride(src0_gate, src0)); + const float * src1_d = (const float *) src1->data; const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr; float * dst_d = (float *) dst->data; + const void * gate_d = src0_gate ? src0_gate->data : nullptr; // If src0 is a temporary compute buffer, clear any potential padding. if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) { @@ -548,11 +592,20 @@ void ggml_cuda_mul_mat_vec_q( const int64_t stride_channel_dst = ids ? s1 : s2; const int64_t stride_channel_y = ids ? 
s11 : s12; - mul_mat_vec_q_switch_type( - src0->data, src0->type, src1_q8_1.get(), ids_d, dst_d, ne00, - ne01, ncols_dst, s01, stride_col_y, stride_col_dst, - ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, - ne03, ne3, s03, s13, s3, stream); + if (src0_gate) { + const ggml_glu_op glu_op = ggml_get_glu_op(glu); + mul_mat_vec_q_switch_type( + src0->data, src0->type, src1_q8_1.get(), ids_d, gate_d, dst_d, ne00, + ne01, ncols_dst, s01, stride_col_y, stride_col_dst, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, glu_op, stream); + } else { + mul_mat_vec_q_switch_type<>( + src0->data, src0->type, src1_q8_1.get(), ids_d, nullptr, dst_d, ne00, + ne01, ncols_dst, s01, stride_col_y, stride_col_dst, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, ggml_glu_op::GGML_GLU_OP_SWIGLU, stream); + } } void ggml_cuda_op_mul_mat_vec_q( @@ -578,9 +631,9 @@ void ggml_cuda_op_mul_mat_vec_q( const int stride_row_x = ne00 / ggml_blck_size(src0->type); const int stride_col_y = src1_padded_row_size / QK8_1; - mul_mat_vec_q_switch_type( - src0_dd_i, src0->type, src1_ddq_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream); + mul_mat_vec_q_switch_type<>( + src0_dd_i, src0->type, src1_ddq_i, nullptr, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ggml_glu_op::GGML_GLU_OP_SWIGLU, stream); GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_ncols, src1_padded_row_size); } diff --git a/ggml/src/ggml-cuda/mmvq.cuh b/ggml/src/ggml-cuda/mmvq.cuh index 39dc7d33eb5ac..bc59e1f0562bc 100644 --- a/ggml/src/ggml-cuda/mmvq.cuh +++ b/ggml/src/ggml-cuda/mmvq.cuh @@ -3,7 +3,7 @@ #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. 
void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); + const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, const ggml_tensor * gate = nullptr, const ggml_tensor * glu = nullptr); void ggml_cuda_op_mul_mat_vec_q( ggml_backend_cuda_context & ctx, diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 3c564566a51ff..eeeeacc11a647 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -18,10 +18,7 @@ static __device__ __forceinline__ float op_step(float x) { } static __device__ __forceinline__ float op_gelu(float x) { - const float GELU_COEF_A = 0.044715f; - const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; - - return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); + return ggml_cuda_op_gelu_single(x); } static __device__ __forceinline__ float op_gelu_erf(float x) { @@ -37,7 +34,7 @@ static __device__ __forceinline__ float op_gelu_quick(float x) { } static __device__ __forceinline__ float op_silu(float x) { - return x / (1.0f + expf(-x)); + return ggml_cuda_op_silu_single(x); } static __device__ __forceinline__ float op_tanh(float x) { diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh index 8e7644fcd9a48..2d9e7fa68b7c1 100644 --- a/ggml/src/ggml-cuda/unary.cuh +++ b/ggml/src/ggml-cuda/unary.cuh @@ -1,3 +1,4 @@ +#pragma once #include "common.cuh" #define CUDA_NEG_BLOCK_SIZE 256 @@ -75,3 +76,14 @@ void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +__device__ __forceinline__ float ggml_cuda_op_silu_single(float x) { + return x / (1.0f + expf(-x)); +} + +__device__ __forceinline__ float ggml_cuda_op_gelu_single(float x) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + + return 0.5f * x * (1.0f + tanhf(SQRT_2_OVER_PI * x * (1.0f + GELU_COEF_A * x * x))); +} diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f29a1e98c9103..2128ab19482db 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -810,6 +810,9 @@ ggml_tensor * llm_graph_context::build_ffn( GGML_ABORT("fatal error"); } + //expand here so that we can fuse ffn gate + ggml_build_forward_expand(gf, cur); + if (gate && type_gate == LLM_FFN_PAR) { cur = ggml_mul(ctx0, cur, tmp); cb(cur, "ffn_gate_par", il); @@ -1061,6 +1064,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn( GGML_ABORT("fatal error"); } + //expand here so that we can fuse ffn gate + ggml_build_forward_expand(gf, cur); + experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index d5c5a2a6656ee..ee9b3fad288b7 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4584,6 +4584,119 @@ struct test_topk_moe: public test_case { } }; +struct test_fused_ffn_gate : public test_case { + const ggml_type type; + const ggml_glu_op glu_op; + const int64_t m; + const int64_t n; + const int64_t k; + const bool use_id; + const int n_mats; + const int n_used; + const bool b; // broadcast b matrix (only for use_id) + + test_fused_ffn_gate(ggml_type type, ggml_glu_op op, int64_t m, int64_t n, int64_t k, + bool use_id 
= false, int n_mats = 1, int n_used = 1, bool b = false) + : type(type), glu_op(op), m(m), n(n), k(k), use_id(use_id), n_mats(n_mats), n_used(n_used), b(b) { + if (use_id) { + GGML_ASSERT(n_used <= n_mats); + } + } + + std::string vars() override { + if (use_id) { + return VARS_TO_STR9(type, glu_op, m, n, k, use_id, n_mats, n_used, b); + } + return VARS_TO_STR5(type, glu_op, m, n, k); + } + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "FUSED_FFN_GATE"; + } + + bool run_whole_graph() override { return true; } + + ggml_tensor * build_graph(ggml_context * ctx) override { + if (!use_id) { + // Original implementation + std::array ne = {k, m, 1, 1}; + std::array ne0 = {k, n, 1, 1}; + + ggml_tensor * cur = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data()); + ggml_tensor * gate = ggml_new_tensor(ctx, type, 4, ne0.data()); + ggml_tensor * up = ggml_new_tensor(ctx, type, 4, ne0.data()); + + ggml_tensor * ffn_up = ggml_mul_mat(ctx, up, cur); + ggml_tensor * ffn_gate = ggml_mul_mat(ctx, gate, cur); + + ggml_tensor * out = ggml_glu_split(ctx, ffn_gate, ffn_up, glu_op); + + ggml_set_name(out, "out"); + return out; + } else { + // mul_mat_id implementation + ggml_tensor * gates = ggml_new_tensor_3d(ctx, type, k, n, n_mats); + ggml_set_name(gates, "gates"); + + ggml_tensor * ups = ggml_new_tensor_3d(ctx, type, k, n, n_mats); + ggml_set_name(ups, "ups"); + + ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, m); + ggml_set_name(ids, "ids"); + if (n_used != n_mats) { + ids = ggml_view_2d(ctx, ids, n_used, m, ids->nb[1], 0); + ggml_set_name(ids, "view_of_ids"); + } + + ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, k, this->b ? 1 : n_used, m); + ggml_set_name(cur, "cur"); + + ggml_tensor * ffn_up = ggml_mul_mat_id(ctx, ups, cur, ids); + ggml_tensor * ffn_gate = ggml_mul_mat_id(ctx, gates, cur, ids); + + ggml_tensor * out = ggml_glu_split(ctx, ffn_gate, ffn_up, glu_op); + + ggml_set_name(out, "out"); + return out; + } + } + + void initialize_tensors(ggml_context * ctx) override { + if (!use_id) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t); + } + } else { + std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { + if (ggml_is_view_op(t->op)) { continue; } + // ids + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int i = 0; i < t->ne[0]; i++) { + data[i] = i % n_mats; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t)); + } + } else { + init_tensor_uniform(t); + } + } + } + } + + double max_nmse_err() override { + if (glu_op == GGML_GLU_OP_GEGLU) { + return 1e-2; + } + return 1e-3; + } +}; + // GGML_OP_SUM struct test_sum : public test_case { const ggml_type type; @@ -6837,6 +6950,19 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3})); test_cases.emplace_back(new test_opt_step_sgd(GGML_TYPE_F32, {10, 5, 4, 3})); + for (ggml_type type : base_types) { + for (bool use_id : {false, true}) { + for (bool b : {false, true}) { + if (!use_id && b) { + continue; + } + for (ggml_glu_op glu_op : {GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU}) { + test_cases.emplace_back(new test_fused_ffn_gate(type, glu_op, 1, 32, 256, use_id, 16, 8, b)); + } + } + } + } + for 
(bool with_norm : {false, true}) { test_cases.emplace_back(new test_topk_moe({8, 22, 1, 1}, 4, with_norm)); test_cases.emplace_back(new test_topk_moe({32, 22, 1, 1}, 8, with_norm));
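// --- Reviewer note (illustration only, not part of the patch) ---------------
// A minimal host-side sketch of the per-output-element math that the fused
// mul_mat_vec_f / mul_mat_vec_q kernels in this diff implement: each block now
// accumulates two dot products per row (up projection and gate projection
// against the same activations y), applies the GLU activation to the gate
// result, and scales the up result by it before the single store to dst.
// The helper name fused_glu_epilogue and the enum below are hypothetical and
// exist only for this sketch; the device code uses ggml_cuda_op_silu_single /
// ggml_cuda_op_gelu_single from unary.cuh.
#include <cmath>

enum class glu_kind { swiglu, geglu };

static inline float fused_glu_epilogue(float dot_up, float dot_gate, glu_kind op) {
    float act = dot_gate;
    switch (op) {
        case glu_kind::swiglu:
            // SiLU(x) = x / (1 + exp(-x)), matching ggml_cuda_op_silu_single
            act = dot_gate / (1.0f + std::exp(-dot_gate));
            break;
        case glu_kind::geglu: {
            // tanh-approximated GELU, matching ggml_cuda_op_gelu_single
            const float GELU_COEF_A    = 0.044715f;
            const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
            act = 0.5f * dot_gate * (1.0f + std::tanh(SQRT_2_OVER_PI * dot_gate * (1.0f + GELU_COEF_A * dot_gate * dot_gate)));
            break;
        }
    }
    return dot_up * act; // dst[row] = (up_row . y) * GLU(gate_row . y)
}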