diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 75fd6db14c514..1c283bc1543b8 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2007,6 +2007,95 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
     }
 }
 
+static bool ggml_cuda_should_fuse_mul_mat(const ggml_tensor * ffn_up,
+                                          const ggml_tensor * ffn_gate,
+                                          const ggml_tensor * glu) {
+    bool is_mul_mat = ffn_up->op == GGML_OP_MUL_MAT && ffn_gate->op == GGML_OP_MUL_MAT && glu->op == GGML_OP_GLU;
+    bool is_mul_mat_id =
+        ffn_up->op == GGML_OP_MUL_MAT_ID && ffn_gate->op == GGML_OP_MUL_MAT_ID && glu->op == GGML_OP_GLU;
+
+    GGML_ASSERT(ffn_up && ffn_gate && glu);
+
+    if (!is_mul_mat && !is_mul_mat_id) {
+        return false;
+    }
+
+    if (ffn_up->src[0]->type != ffn_gate->src[0]->type || !ggml_are_same_shape(ffn_up->src[0], ffn_gate->src[0]) ||
+        !ggml_are_same_stride(ffn_up->src[0], ffn_gate->src[0])) {
+        return false;
+    }
+
+    if (ffn_up->src[1] != ffn_gate->src[1]) {
+        return false;
+    }
+
+    if (ffn_up->src[2] && (ffn_up->src[2] != ffn_gate->src[2])) {
+        return false;
+    }
+
+    if (glu->src[0] != ffn_gate && glu->src[1] != ffn_up) {
+        return false;
+    }
+
+    static constexpr std::array valid_glu_ops = { GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU };
+
+    if (std::find(valid_glu_ops.begin(), valid_glu_ops.end(), ggml_get_glu_op(glu)) == valid_glu_ops.end()) {
+        return false;
+    }
+
+    if (const bool swapped = ggml_get_op_params_i32(glu, 1); swapped) {
+        return false;
+    }
+
+    const bool split = ggml_backend_buft_is_cuda_split(ffn_up->src[0]->buffer->buft) ||
+                       ggml_backend_buft_is_cuda_split(ffn_gate->src[0]->buffer->buft);
+
+    //TODO: add support for fusion for split buffers
+    if (split) {
+        return false;
+    }
+
+    return true;
+}
+
+static bool ggml_cuda_should_fuse_mul_mat_vec_f(const ggml_tensor * tensor) {
+    ggml_tensor * src0 = tensor->src[0];
+    ggml_tensor * src1 = tensor->src[1];
+    const ggml_tensor * dst = tensor;
+
+    bool use_mul_mat_vec_f =
+        (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) &&
+        src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
+
+    const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
+    use_mul_mat_vec_f = use_mul_mat_vec_f && ggml_cuda_should_use_mmvf(src0->type, cc, src0->ne, src1->ne[1]);
+
+    if (tensor->op == GGML_OP_MUL_MAT_ID) {
+        use_mul_mat_vec_f = use_mul_mat_vec_f && dst->ne[2] == 1;
+    }
+
+    return use_mul_mat_vec_f;
+}
+
+static bool ggml_cuda_should_fuse_mul_mat_vec_q(const ggml_tensor * tensor) {
+    ggml_tensor * src0 = tensor->src[0];
+    ggml_tensor * src1 = tensor->src[1];
+    const ggml_tensor * dst = tensor;
+
+    const bool bad_padding_clear = ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE &&
+                                   ggml_nbytes(src0) != ggml_backend_buffer_get_alloc_size(src0->buffer, src0) &&
+                                   src0->view_src;
+
+    bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) && !bad_padding_clear && src1->type == GGML_TYPE_F32 &&
+                             dst->type == GGML_TYPE_F32 && src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
+
+    if (tensor->op == GGML_OP_MUL_MAT_ID) {
+        use_mul_mat_vec_q = use_mul_mat_vec_q && dst->ne[2] == 1;
+    }
+
+    return use_mul_mat_vec_q;
+}
+
 static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
 
@@ -2855,6 +2944,25 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph * cgraph, int node_idx,
         }
     }
 
+
std::initializer_list mul_mat_id_glu_ops = { GGML_OP_MUL_MAT_ID, GGML_OP_MUL_MAT_ID, GGML_OP_GLU }; + + std::initializer_list mul_mat_glu_ops = { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT, GGML_OP_GLU }; + + if (ops.size() == 3 && (std::equal(ops.begin(), ops.end(), mul_mat_id_glu_ops.begin()) || + std::equal(ops.begin(), ops.end(), mul_mat_glu_ops.begin()))) { + if (node_idx + 2 >= cgraph->n_nodes) { + return false; + } + + const ggml_tensor * ffn_gate = cgraph->nodes[node_idx]; + const ggml_tensor * ffn_up = cgraph->nodes[node_idx + 1]; + const ggml_tensor * glu = cgraph->nodes[node_idx + 2]; + + if (ggml_cuda_should_fuse_mul_mat(ffn_up, ffn_gate, glu)) { + return true; + } + } + if (!ggml_can_fuse(cgraph, node_idx, ops)) { return false; } @@ -2992,6 +3100,36 @@ static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx } } + bool fused_mul_mat_vec = false; + + for (ggml_op op : { GGML_OP_MUL_MAT, GGML_OP_MUL_MAT_ID }) { + if (ggml_cuda_can_fuse(cgraph, i, { op, op, GGML_OP_GLU }, {})) { + ggml_tensor * glu = cgraph->nodes[i + 2]; + ggml_tensor * gate = glu->src[0]; + ggml_tensor * up = glu->src[1]; + + const ggml_tensor * src0 = up->src[0]; + const ggml_tensor * src1 = up->src[1]; + const ggml_tensor * ids = up->src[2]; + + if (ggml_cuda_should_fuse_mul_mat_vec_f(up)) { + ggml_cuda_mul_mat_vec_f(*cuda_ctx, src0, src1, ids, glu, gate->src[0], glu); + fused_mul_mat_vec = true; + break; + } + + if (ggml_cuda_should_fuse_mul_mat_vec_q(up)) { + ggml_cuda_mul_mat_vec_q(*cuda_ctx, src0, src1, ids, glu, gate->src[0], glu); + fused_mul_mat_vec = true; + break; + } + } + } + + if (fused_mul_mat_vec) { + i += 2; + continue; + } if (ggml_cuda_can_fuse(cgraph, i, { GGML_OP_RMS_NORM, GGML_OP_MUL, GGML_OP_ADD}, {})) { ggml_cuda_op_rms_norm_fused_add(*cuda_ctx, node, cgraph->nodes[i+1], cgraph->nodes[i+2]); diff --git a/ggml/src/ggml-cuda/mmvf.cu b/ggml/src/ggml-cuda/mmvf.cu index 57ab839393aa0..04682cc5f43b5 100644 --- a/ggml/src/ggml-cuda/mmvf.cu +++ b/ggml/src/ggml-cuda/mmvf.cu @@ -2,13 +2,14 @@ #include "common.cuh" #include "convert.cuh" #include "mmvf.cuh" +#include "unary.cuh" -template +template static __global__ void mul_mat_vec_f( - const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst, + const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, const T * __restrict__ gate_x, float * __restrict__ dst, const int ncols2, const int nchannels_y, const int stride_row, const int stride_col_y2, const int stride_col_dst, const uint3 channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, - const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { + const uint3 sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, const ggml_glu_op glu_op) { const int row = blockIdx.x; const int channel_dst = blockIdx.y; const int channel_x = ids ? 
ids[channel_dst] : fastdiv((uint32_t) channel_dst, channel_ratio); @@ -24,58 +25,85 @@ static __global__ void mul_mat_vec_f( y += int64_t(sample_y) *stride_sample_y + channel_y *stride_channel_y; dst += int64_t(sample_dst)*stride_sample_dst + channel_dst*stride_channel_dst; + if constexpr (has_gate) { + gate_x += int64_t(sample_x) *stride_sample_x + channel_x *stride_channel_x + row*stride_row; + } + const float2 * y2 = (const float2 *) y; extern __shared__ char data_mmv[]; float * buf_iw = (float *) data_mmv; + float * buf_iw_gate = has_gate ? (float *) (data_mmv + warp_size*sizeof(float)) : nullptr; if (block_size > warp_size) { if (tid < warp_size) { buf_iw[tid] = 0.0f; + if constexpr (has_gate) { + buf_iw_gate[tid] = 0.0f; + } } __syncthreads(); } float sumf[ncols_dst] = {0.0f}; + float sumf_gate[ncols_dst] = {0.0f}; if constexpr (std::is_same_v) { const float2 * x2 = (const float2 *) x; + const float2 * gate_x2 = has_gate ? (const float2 *) gate_x : nullptr; for (int col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = x2[col2]; + const float2 tmpx_gate = has_gate ? gate_x2[col2] : make_float2(0.0f, 0.0f); #pragma unroll for (int j = 0; j < ncols_dst; ++j) { const float2 tmpy = y2[j*stride_col_y2 + col2]; ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x); ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y); + + if constexpr (has_gate) { + ggml_cuda_mad(sumf_gate[j], tmpx_gate.x, tmpy.x); + ggml_cuda_mad(sumf_gate[j], tmpx_gate.y, tmpy.y); + } } } } else if constexpr (std::is_same_v) { const half2 * x2 = (const half2 *) x; + const half2 * gate_x2 = has_gate ? (const half2 *) gate_x : nullptr; if (std::is_same_v) { for (int col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = __half22float2(x2[col2]); - + const float2 tmpx_gate = has_gate ? __half22float2(gate_x2[col2]) : make_float2(0.0f, 0.0f); #pragma unroll for (int j = 0; j < ncols_dst; ++j) { const float2 tmpy = y2[j*stride_col_y2 + col2]; ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x); ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y); + + if constexpr (has_gate) { + ggml_cuda_mad(sumf_gate[j], tmpx_gate.x, tmpy.x); + ggml_cuda_mad(sumf_gate[j], tmpx_gate.y, tmpy.y); + } } } } else { #ifdef FP16_AVAILABLE half2 sumh2[ncols_dst] = {{0.0f, 0.0f}}; + half2 sumh2_gate[ncols_dst] = {{0.0f, 0.0f}}; for (int col2 = tid; col2 < ncols2; col2 += block_size) { const half2 tmpx = x2[col2]; - + const half2 tmpx_gate = has_gate ? gate_x2[col2] : make_half2(0.0f, 0.0f); #pragma unroll for (int j = 0; j < ncols_dst; ++j) { const float2 tmpy = y2[j*stride_col_y2 + col2]; sumh2[j] += tmpx * make_half2(tmpy.x, tmpy.y); + + if constexpr (has_gate) { + sumh2_gate[j] += tmpx_gate * make_half2(tmpy.x, tmpy.y); + } } } @@ -83,6 +111,12 @@ static __global__ void mul_mat_vec_f( for (int j = 0; j < ncols_dst; ++j) { sumf[j] = __low2float(sumh2[j]) + __high2float(sumh2[j]); } + + if constexpr (has_gate) { + for (int j = 0; j < ncols_dst; ++j) { + sumf_gate[j] = __low2float(sumh2_gate[j]) + __high2float(sumh2_gate[j]); + } + } #else NO_DEVICE_CODE; #endif // FP16_AVAILABLE @@ -91,8 +125,10 @@ static __global__ void mul_mat_vec_f( //TODO: add support for ggml_cuda_mad for hip_bfloat162 #if defined(GGML_USE_HIP) const int * x2 = (const int *) x; + const int * gate_x2 = has_gate ? (const int *) gate_x : nullptr; for (int col2 = tid; col2 < ncols2; col2 += block_size) { const int tmpx = x2[col2]; + const int tmpx_gate = has_gate ? 
gate_x2[col2] : 0; #pragma unroll for (int j = 0; j < ncols_dst; ++j) { const float2 tmpy = y2[j*stride_col_y2 + col2]; @@ -100,17 +136,34 @@ static __global__ void mul_mat_vec_f( const float tmpx1 = ggml_cuda_cast(reinterpret_cast(&tmpx)[1]); ggml_cuda_mad(sumf[j], tmpx0, tmpy.x); ggml_cuda_mad(sumf[j], tmpx1, tmpy.y); + + if constexpr (has_gate) { + const float tmpx0 = ggml_cuda_cast(reinterpret_cast(&tmpx_gate)[0]); + const float tmpx1 = ggml_cuda_cast(reinterpret_cast(&tmpx_gate)[1]); + ggml_cuda_mad(sumf_gate[j], tmpx0, tmpy.x); + ggml_cuda_mad(sumf_gate[j], tmpx1, tmpy.y); + } } } #else const nv_bfloat162 * x2 = (const nv_bfloat162 *) x; + const nv_bfloat162 * gate_x2 = has_gate ? (const nv_bfloat162 *) gate_x : nullptr; for (int col2 = tid; col2 < ncols2; col2 += block_size) { const nv_bfloat162 tmpx = x2[col2]; + nv_bfloat162 tmpx_gate; + if constexpr (has_gate) { + tmpx_gate = gate_x2[col2]; + } #pragma unroll for (int j = 0; j < ncols_dst; ++j) { const float2 tmpy = y2[j*stride_col_y2 + col2]; ggml_cuda_mad(sumf[j], tmpx.x, tmpy.x); ggml_cuda_mad(sumf[j], tmpx.y, tmpy.y); + + if constexpr (has_gate) { + ggml_cuda_mad(sumf_gate[j], tmpx_gate.x, tmpy.x); + ggml_cuda_mad(sumf_gate[j], tmpx_gate.y, tmpy.y); + } } } #endif @@ -122,13 +175,25 @@ static __global__ void mul_mat_vec_f( for (int j = 0; j < ncols_dst; ++j) { sumf[j] = warp_reduce_sum(sumf[j]); + if constexpr (has_gate) { + sumf_gate[j] = warp_reduce_sum(sumf_gate[j]); + } + if (block_size > warp_size) { buf_iw[tid/warp_size] = sumf[j]; + if constexpr (has_gate) { + buf_iw_gate[tid/warp_size] = sumf_gate[j]; + } __syncthreads(); if (tid < warp_size) { sumf[j] = buf_iw[tid]; sumf[j] = warp_reduce_sum(sumf[j]); + if constexpr (has_gate) { + sumf_gate[j] = buf_iw_gate[tid]; + sumf_gate[j] = warp_reduce_sum(sumf_gate[j]); + } } + if (j < ncols_dst) { __syncthreads(); } @@ -139,18 +204,35 @@ static __global__ void mul_mat_vec_f( return; } - dst[tid*stride_col_dst + row] = sumf[tid]; + if constexpr (has_gate) { + float gate_value = sumf_gate[tid]; + float gated_value; + switch (glu_op) { + case GGML_GLU_OP_SWIGLU: + gated_value = ggml_cuda_op_silu_single(gate_value); + break; + case GGML_GLU_OP_GEGLU: + gated_value = ggml_cuda_op_gelu_single(gate_value); + break; + default: + gated_value = gate_value; + break; + } + dst[tid*stride_col_dst + row] = sumf[tid] * gated_value; + } else { + dst[tid*stride_col_dst + row] = sumf[tid]; + } } -template +template static void launch_mul_mat_vec_f_cuda( - const T * x, const float * y, const int32_t * ids, float * dst, + const T * x, const float * y, const int32_t * ids, const T * gate_x, float * dst, const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, - cudaStream_t stream) { + const ggml_glu_op glu_op, cudaStream_t stream) { GGML_ASSERT(ncols % 2 == 0); GGML_ASSERT(stride_row % 2 == 0); GGML_ASSERT(stride_col_y % 2 == 0); @@ -176,57 +258,57 @@ static void launch_mul_mat_vec_f_cuda( } } - const int nbytes_shared = warp_size*sizeof(float); + const int nbytes_shared = warp_size*sizeof(float) + (has_gate ? 
warp_size*sizeof(float) : 0); const dim3 block_nums(nrows, nchannels_dst, nsamples_dst); const dim3 block_dims(block_size_best, 1, 1); switch (block_size_best) { case 32: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 64: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 96: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 128: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 160: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 192: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 224: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 256: { - mul_mat_vec_f<<>> - (x, y, ids, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, + 
mul_mat_vec_f<<>> + (x, y, ids, gate_x, dst, ncols/2, nchannels_y, stride_row, stride_col_y/2, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; default: { GGML_ABORT("fatal error"); @@ -234,63 +316,63 @@ static void launch_mul_mat_vec_f_cuda( } } -template +template static void mul_mat_vec_f_cuda_switch_ncols_dst( - const T * x, const float * y, const int32_t * ids, float * dst, + const T * x, const float * y, const int32_t * ids, const T * gate_x, float * dst, const int64_t ncols, const int64_t nrows, const int64_t ncols_dst, const int64_t stride_row, const int64_t stride_col_y, const int64_t stride_col_dst, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, - cudaStream_t stream) { + const ggml_glu_op glu_op, cudaStream_t stream) { switch (ncols_dst) { case 1: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 2: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 3: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 4: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 5: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, 
ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 6: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 7: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; case 8: - launch_mul_mat_vec_f_cuda - (x, y, ids, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, + launch_mul_mat_vec_f_cuda + (x, y, ids, gate_x, dst, ncols, nrows, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op, stream); break; default: GGML_ABORT("fatal error"); @@ -300,29 +382,48 @@ static void mul_mat_vec_f_cuda_switch_ncols_dst( template static void mul_mat_vec_f_cuda( - const T * x, const float * y, const int32_t * ids, float * dst, + const T * x, const float * y, const int32_t * ids, const T * gate_x, ggml_glu_op op, float * dst, const int64_t ncols, const int64_t nrows, const int64_t ncols_dst, const int64_t stride_row, const int64_t stride_col_y, const int stride_col_dst, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, enum ggml_prec prec, cudaStream_t stream) { - if constexpr(std::is_same_v) { - if (prec == GGML_PREC_DEFAULT) { - mul_mat_vec_f_cuda_switch_ncols_dst - (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - return; + + if (gate_x) { + if constexpr(std::is_same_v) { + if (prec == GGML_PREC_DEFAULT) { + mul_mat_vec_f_cuda_switch_ncols_dst + (x, y, ids, gate_x, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, 
stride_sample_x, stride_sample_y, stride_sample_dst, op, stream); + return; + } } + mul_mat_vec_f_cuda_switch_ncols_dst + (x, y, ids, gate_x, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, op, stream); + } + else { + if constexpr(std::is_same_v) { + if (prec == GGML_PREC_DEFAULT) { + mul_mat_vec_f_cuda_switch_ncols_dst + (x, y, ids, nullptr, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, op, stream); + return; + } + } + mul_mat_vec_f_cuda_switch_ncols_dst + (x, y, ids, nullptr, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, op, stream); } - mul_mat_vec_f_cuda_switch_ncols_dst - (x, y, ids, dst, ncols, nrows, ncols_dst, stride_row, stride_col_y, stride_col_dst, - nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, - stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); } -void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { +void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, + const ggml_tensor * src0_gate, const ggml_tensor * glu) { GGML_ASSERT( src1->type == GGML_TYPE_F32); GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -341,6 +442,8 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); GGML_ASSERT( nb0 == ts_dst); + GGML_ASSERT(!src0_gate || ggml_are_same_stride(src0_gate, src0)); + const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32; @@ -367,22 +470,27 @@ void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor GGML_ASSERT(!ids || ncols_dst == 1); + ggml_glu_op glu_op = glu ? ggml_get_glu_op(glu) : ggml_glu_op::GGML_GLU_OP_SWIGLU; + switch (src0->type) { case GGML_TYPE_F32: { const float * src0_d = (const float *) src0->data; - mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, + const float * src0_gate_d = src0_gate ? (const float *) src0_gate->data : nullptr; + mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, src0_gate_d, glu_op, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0->data; - mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, + const half * src0_gate_d = src0_gate ? 
(const half *) src0_gate->data : nullptr; + mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, src0_gate_d, glu_op, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data; - mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, + const nv_bfloat16 * src0_gate_d = src0_gate ? (const nv_bfloat16 *) src0_gate->data : nullptr; + mul_mat_vec_f_cuda(src0_d, src1_d, ids_d, src0_gate_d, glu_op, dst_d, ne00, ne01, ncols_dst, s01, s11, s1, ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; @@ -429,19 +537,22 @@ void ggml_cuda_op_mul_mat_vec_f( switch (src0->type) { case GGML_TYPE_F32: { const float * src0_d = (const float *) src0_dd_i; - mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, + const float * src0_gate_d = nullptr; + mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, src0_gate_d, ggml_glu_op::GGML_GLU_OP_SWIGLU, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0_dd_i; - mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, + const half * src0_gate_d = nullptr; + mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, src0_gate_d, ggml_glu_op::GGML_GLU_OP_SWIGLU, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i; - mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, + const nv_bfloat16 * src0_gate_d = nullptr; + mul_mat_vec_f_cuda(src0_d, src1_ddf_i, nullptr, src0_gate_d, ggml_glu_op::GGML_GLU_OP_SWIGLU, dst_dd_i, ne00, row_diff, src1_ncols, stride_row, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; diff --git a/ggml/src/ggml-cuda/mmvf.cuh b/ggml/src/ggml-cuda/mmvf.cuh index 1da460992e784..4dc4ad42759d2 100644 --- a/ggml/src/ggml-cuda/mmvf.cuh +++ b/ggml/src/ggml-cuda/mmvf.cuh @@ -1,6 +1,7 @@ #include "common.cuh" -void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); +void ggml_cuda_mul_mat_vec_f(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, + const ggml_tensor * src0_gate = nullptr, const ggml_tensor * glu = nullptr); void ggml_cuda_op_mul_mat_vec_f( ggml_backend_cuda_context & ctx, diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index 3bf0c9ed25038..c99234693bf2f 100644 --- 
a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -1,5 +1,6 @@ #include "mmvq.cuh" #include "quantize.cuh" +#include "unary.cuh" #include "vecdotq.cuh" #include @@ -136,15 +137,15 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int return 1; } -template +template // tell the compiler to use as many registers as it wants, see nwarps definition below __launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) static __global__ void mul_mat_vec_q( - const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, float * __restrict__ dst, + const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, const void * __restrict__ vgate, float * __restrict__ dst, const uint32_t ncols_x, const uint3 nchannels_y, const uint32_t stride_row_x, const uint32_t stride_col_y, const uint32_t stride_col_dst, const uint3 channel_ratio, const uint32_t stride_channel_x, const uint32_t stride_channel_y, const uint32_t stride_channel_dst, const uint3 sample_ratio, - const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst) { + const uint32_t stride_sample_x, const uint32_t stride_sample_y, const uint32_t stride_sample_dst, const ggml_glu_op glu_op) { constexpr int qk = ggml_cuda_type_traits::qk; constexpr int qi = ggml_cuda_type_traits::qi; @@ -171,6 +172,7 @@ static __global__ void mul_mat_vec_q( // partial sum for each thread float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}}; + float tmp_gate[ncols_dst][rows_per_cuda_block] = {{0.0f}}; const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y; const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x; @@ -187,17 +189,28 @@ static __global__ void mul_mat_vec_q( for (int i = 0; i < rows_per_cuda_block; ++i) { tmp[j][i] += vec_dot_q_cuda( vx, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs); + if constexpr (has_gate) { + tmp_gate[j][i] += vec_dot_q_cuda( + vgate, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs); + } } } } __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; + __shared__ float tmp_shared_gate[(has_gate && (nwarps-1 > 0)) ? 
nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; + if constexpr (!has_gate) { + (void) tmp_shared_gate; + } if (threadIdx.y > 0) { #pragma unroll for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i]; + if constexpr (has_gate) { + tmp_shared_gate[threadIdx.y-1][j][i][threadIdx.x] = tmp_gate[j][i]; + } } } } @@ -216,12 +229,35 @@ static __global__ void mul_mat_vec_q( #pragma unroll for (int l = 0; l < nwarps-1; ++l) { tmp[j][i] += tmp_shared[l][j][i][threadIdx.x]; + if constexpr (has_gate) { + tmp_gate[j][i] += tmp_shared_gate[l][j][i][threadIdx.x]; + } } tmp[j][i] = warp_reduce_sum(tmp[j][i]); + if constexpr (has_gate) { + tmp_gate[j][i] = warp_reduce_sum(tmp_gate[j][i]); + } } if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || uint32_t(row0 + threadIdx.x) < stride_col_dst)) { - dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x]; + float result = tmp[j][threadIdx.x]; + if constexpr (has_gate) { + float gate_value = tmp_gate[j][threadIdx.x]; + float gated_value; + switch (glu_op) { + case GGML_GLU_OP_SWIGLU: + gated_value = ggml_cuda_op_silu_single(gate_value); + break; + case GGML_GLU_OP_GEGLU: + gated_value = ggml_cuda_op_gelu_single(gate_value); + break; + default: + gated_value = gate_value; + break; + } + result = result * gated_value; + } + dst[j*stride_col_dst + threadIdx.x] = result; } } } @@ -235,15 +271,15 @@ static std::pair calc_launch_params( return {block_nums, block_dims}; } -template +template static void mul_mat_vec_q_switch_ncols_dst( - const void * vx, const void * vy, const int32_t * ids, float * dst, + const void * vx, const void * vy, const int32_t * ids, const void * vgate, float * dst, const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int stride_col_y, const int stride_col_dst, const int nchannels_x, const int nchannels_y, const int nchannels_dst, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, - cudaStream_t stream) { + const ggml_glu_op glu_op, cudaStream_t stream) { GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0); GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE); @@ -261,66 +297,66 @@ static void mul_mat_vec_q_switch_ncols_dst( case 1: { constexpr int c_ncols_dst = 1; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 2: { constexpr int c_ncols_dst = 2; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, 
stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 3: { constexpr int c_ncols_dst = 3; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 4: { constexpr int c_ncols_dst = 4; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 5: { constexpr int c_ncols_dst = 5; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 6: { constexpr int c_ncols_dst = 6; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 7: { constexpr int c_ncols_dst = 7; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; case 8: { constexpr int c_ncols_dst = 8; std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); - mul_mat_vec_q<<>> - (vx, vy, ids, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q<<>> + (vx, vy, ids, vgate, dst, ncols_x, nchannels_y_fd, stride_row_x, stride_col_y, 
stride_col_dst, channel_ratio_fd, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst); + sample_ratio_fd, stride_sample_x, stride_sample_y, stride_sample_dst, glu_op); } break; default: GGML_ABORT("fatal error"); @@ -328,154 +364,155 @@ static void mul_mat_vec_q_switch_ncols_dst( } } +template static void mul_mat_vec_q_switch_type( - const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, float * dst, + const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, const void * vgate, float * dst, const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_row_x, const int stride_col_y, const int stride_col_dst, const int nchannels_x, const int nchannels_y, const int nchannels_dst, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, - cudaStream_t stream) { + const ggml_glu_op glu_op, cudaStream_t stream) { switch (type_x) { case GGML_TYPE_Q4_0: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q4_1: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q5_0: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q5_1: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q8_0: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, 
stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_MXFP4: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q2_K: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q3_K: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q4_K: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q5_K: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_Q6_K: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ2_XXS: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ2_XS: - 
mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ2_S: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ3_XXS: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ1_S: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ1_M: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ4_NL: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ4_XS: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; case GGML_TYPE_IQ3_S: - mul_mat_vec_q_switch_ncols_dst - (vx, vy, ids, dst, ncols_x, nrows_x, 
ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, vgate, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, - stream); + glu_op, stream); break; default: GGML_ABORT("fatal error"); @@ -484,10 +521,14 @@ static void mul_mat_vec_q_switch_type( } void ggml_cuda_mul_mat_vec_q( - ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, + const ggml_tensor * src0_gate, const ggml_tensor * glu) { GGML_ASSERT( src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID. + GGML_ASSERT(!src0_gate || src0_gate->type == src0->type); + GGML_ASSERT(!src0_gate || ggml_are_same_shape(src0_gate, src0)); + GGML_ASSERT(!src0_gate || glu != nullptr); GGML_TENSOR_BINARY_OP_LOCALS; @@ -504,9 +545,12 @@ void ggml_cuda_mul_mat_vec_q( GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. + GGML_ASSERT(!src0_gate || ggml_are_same_stride(src0_gate, src0)); + const float * src1_d = (const float *) src1->data; const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr; float * dst_d = (float *) dst->data; + const void * gate_d = src0_gate ? src0_gate->data : nullptr; // If src0 is a temporary compute buffer, clear any potential padding. if (ggml_backend_buffer_get_usage(src0->buffer) == GGML_BACKEND_BUFFER_USAGE_COMPUTE) { @@ -548,11 +592,20 @@ void ggml_cuda_mul_mat_vec_q( const int64_t stride_channel_dst = ids ? s1 : s2; const int64_t stride_channel_y = ids ? 
s11 : s12; - mul_mat_vec_q_switch_type( - src0->data, src0->type, src1_q8_1.get(), ids_d, dst_d, ne00, - ne01, ncols_dst, s01, stride_col_y, stride_col_dst, - ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, - ne03, ne3, s03, s13, s3, stream); + if (src0_gate) { + const ggml_glu_op glu_op = ggml_get_glu_op(glu); + mul_mat_vec_q_switch_type( + src0->data, src0->type, src1_q8_1.get(), ids_d, gate_d, dst_d, ne00, + ne01, ncols_dst, s01, stride_col_y, stride_col_dst, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, glu_op, stream); + } else { + mul_mat_vec_q_switch_type<>( + src0->data, src0->type, src1_q8_1.get(), ids_d, nullptr, dst_d, ne00, + ne01, ncols_dst, s01, stride_col_y, stride_col_dst, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, ggml_glu_op::GGML_GLU_OP_SWIGLU, stream); + } } void ggml_cuda_op_mul_mat_vec_q( @@ -578,9 +631,9 @@ void ggml_cuda_op_mul_mat_vec_q( const int stride_row_x = ne00 / ggml_blck_size(src0->type); const int stride_col_y = src1_padded_row_size / QK8_1; - mul_mat_vec_q_switch_type( - src0_dd_i, src0->type, src1_ddq_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream); + mul_mat_vec_q_switch_type<>( + src0_dd_i, src0->type, src1_ddq_i, nullptr, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ggml_glu_op::GGML_GLU_OP_SWIGLU, stream); GGML_UNUSED_VARS(src1, dst, src1_ddf_i, src1_ncols, src1_padded_row_size); } diff --git a/ggml/src/ggml-cuda/mmvq.cuh b/ggml/src/ggml-cuda/mmvq.cuh index 39dc7d33eb5ac..bc59e1f0562bc 100644 --- a/ggml/src/ggml-cuda/mmvq.cuh +++ b/ggml/src/ggml-cuda/mmvq.cuh @@ -3,7 +3,7 @@ #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. 
void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); + const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, const ggml_tensor * gate = nullptr, const ggml_tensor * glu = nullptr); void ggml_cuda_op_mul_mat_vec_q( ggml_backend_cuda_context & ctx, diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 3c564566a51ff..eeeeacc11a647 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -18,10 +18,7 @@ static __device__ __forceinline__ float op_step(float x) { } static __device__ __forceinline__ float op_gelu(float x) { - const float GELU_COEF_A = 0.044715f; - const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; - - return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); + return ggml_cuda_op_gelu_single(x); } static __device__ __forceinline__ float op_gelu_erf(float x) { @@ -37,7 +34,7 @@ static __device__ __forceinline__ float op_gelu_quick(float x) { } static __device__ __forceinline__ float op_silu(float x) { - return x / (1.0f + expf(-x)); + return ggml_cuda_op_silu_single(x); } static __device__ __forceinline__ float op_tanh(float x) { diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh index 8e7644fcd9a48..2d9e7fa68b7c1 100644 --- a/ggml/src/ggml-cuda/unary.cuh +++ b/ggml/src/ggml-cuda/unary.cuh @@ -1,3 +1,4 @@ +#pragma once #include "common.cuh" #define CUDA_NEG_BLOCK_SIZE 256 @@ -75,3 +76,14 @@ void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst); void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst); + +__device__ __forceinline__ float ggml_cuda_op_silu_single(float x) { + return x / (1.0f + expf(-x)); +} + +__device__ __forceinline__ float ggml_cuda_op_gelu_single(float x) { + const float GELU_COEF_A = 0.044715f; + const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; + + return 0.5f * x * (1.0f + tanhf(SQRT_2_OVER_PI * x * (1.0f + GELU_COEF_A * x * x))); +} diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f29a1e98c9103..2128ab19482db 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -810,6 +810,9 @@ ggml_tensor * llm_graph_context::build_ffn( GGML_ABORT("fatal error"); } + //expand here so that we can fuse ffn gate + ggml_build_forward_expand(gf, cur); + if (gate && type_gate == LLM_FFN_PAR) { cur = ggml_mul(ctx0, cur, tmp); cb(cur, "ffn_gate_par", il); @@ -1061,6 +1064,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn( GGML_ABORT("fatal error"); } + //expand here so that we can fuse ffn gate + ggml_build_forward_expand(gf, cur); + experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index d5c5a2a6656ee..ee9b3fad288b7 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4584,6 +4584,119 @@ struct test_topk_moe: public test_case { } }; +struct test_fused_ffn_gate : public test_case { + const ggml_type type; + const ggml_glu_op glu_op; + const int64_t m; + const int64_t n; + const int64_t k; + const bool use_id; + const int n_mats; + const int n_used; + const bool b; // broadcast b matrix (only for use_id) + + test_fused_ffn_gate(ggml_type type, ggml_glu_op op, int64_t m, int64_t n, int64_t k, + bool use_id 
= false, int n_mats = 1, int n_used = 1, bool b = false) + : type(type), glu_op(op), m(m), n(n), k(k), use_id(use_id), n_mats(n_mats), n_used(n_used), b(b) { + if (use_id) { + GGML_ASSERT(n_used <= n_mats); + } + } + + std::string vars() override { + if (use_id) { + return VARS_TO_STR9(type, glu_op, m, n, k, use_id, n_mats, n_used, b); + } + return VARS_TO_STR5(type, glu_op, m, n, k); + } + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "FUSED_FFN_GATE"; + } + + bool run_whole_graph() override { return true; } + + ggml_tensor * build_graph(ggml_context * ctx) override { + if (!use_id) { + // Original implementation + std::array ne = {k, m, 1, 1}; + std::array ne0 = {k, n, 1, 1}; + + ggml_tensor * cur = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne.data()); + ggml_tensor * gate = ggml_new_tensor(ctx, type, 4, ne0.data()); + ggml_tensor * up = ggml_new_tensor(ctx, type, 4, ne0.data()); + + ggml_tensor * ffn_up = ggml_mul_mat(ctx, up, cur); + ggml_tensor * ffn_gate = ggml_mul_mat(ctx, gate, cur); + + ggml_tensor * out = ggml_glu_split(ctx, ffn_gate, ffn_up, glu_op); + + ggml_set_name(out, "out"); + return out; + } else { + // mul_mat_id implementation + ggml_tensor * gates = ggml_new_tensor_3d(ctx, type, k, n, n_mats); + ggml_set_name(gates, "gates"); + + ggml_tensor * ups = ggml_new_tensor_3d(ctx, type, k, n, n_mats); + ggml_set_name(ups, "ups"); + + ggml_tensor * ids = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, n_mats, m); + ggml_set_name(ids, "ids"); + if (n_used != n_mats) { + ids = ggml_view_2d(ctx, ids, n_used, m, ids->nb[1], 0); + ggml_set_name(ids, "view_of_ids"); + } + + ggml_tensor * cur = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, k, this->b ? 1 : n_used, m); + ggml_set_name(cur, "cur"); + + ggml_tensor * ffn_up = ggml_mul_mat_id(ctx, ups, cur, ids); + ggml_tensor * ffn_gate = ggml_mul_mat_id(ctx, gates, cur, ids); + + ggml_tensor * out = ggml_glu_split(ctx, ffn_gate, ffn_up, glu_op); + + ggml_set_name(out, "out"); + return out; + } + } + + void initialize_tensors(ggml_context * ctx) override { + if (!use_id) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t); + } + } else { + std::random_device rd; + std::default_random_engine rng(rd()); + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { + if (t->type == GGML_TYPE_I32) { + if (ggml_is_view_op(t->op)) { continue; } + // ids + for (int64_t r = 0; r < ggml_nrows(t); r++) { + std::vector data(t->ne[0]); + for (int i = 0; i < t->ne[0]; i++) { + data[i] = i % n_mats; + } + std::shuffle(data.begin(), data.end(), rng); + ggml_backend_tensor_set(t, data.data(), r * t->nb[1], t->ne[0] * sizeof(int32_t)); + } + } else { + init_tensor_uniform(t); + } + } + } + } + + double max_nmse_err() override { + if (glu_op == GGML_GLU_OP_GEGLU) { + return 1e-2; + } + return 1e-3; + } +}; + // GGML_OP_SUM struct test_sum : public test_case { const ggml_type type; @@ -6837,6 +6950,19 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_opt_step_adamw(GGML_TYPE_F32, {10, 5, 4, 3})); test_cases.emplace_back(new test_opt_step_sgd(GGML_TYPE_F32, {10, 5, 4, 3})); + for (ggml_type type : base_types) { + for (bool use_id : {false, true}) { + for (bool b : {false, true}) { + if (!use_id && b) { + continue; + } + for (ggml_glu_op glu_op : {GGML_GLU_OP_SWIGLU, GGML_GLU_OP_GEGLU}) { + test_cases.emplace_back(new test_fused_ffn_gate(type, glu_op, 1, 32, 256, use_id, 16, 8, b)); + } + } + } + } + for 
(bool with_norm : {false, true}) { test_cases.emplace_back(new test_topk_moe({8, 22, 1, 1}, 4, with_norm)); test_cases.emplace_back(new test_topk_moe({32, 22, 1, 1}, 8, with_norm));
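// --- Reviewer note (illustration only, not part of the patch) ---------------
// A minimal host-side sketch of the per-output-element math that the fused
// mul_mat_vec_f / mul_mat_vec_q kernels in this diff implement: each block now
// accumulates two dot products per row (up projection and gate projection
// against the same activations y), applies the GLU activation to the gate
// result, and scales the up result by it before the single store to dst.
// The helper name fused_glu_epilogue and the enum below are hypothetical and
// exist only for this sketch; the device code uses ggml_cuda_op_silu_single /
// ggml_cuda_op_gelu_single from unary.cuh.
#include <cmath>

enum class glu_kind { swiglu, geglu };

static inline float fused_glu_epilogue(float dot_up, float dot_gate, glu_kind op) {
    float act = dot_gate;
    switch (op) {
        case glu_kind::swiglu:
            // SiLU(x) = x / (1 + exp(-x)), matching ggml_cuda_op_silu_single
            act = dot_gate / (1.0f + std::exp(-dot_gate));
            break;
        case glu_kind::geglu: {
            // tanh-approximated GELU, matching ggml_cuda_op_gelu_single
            const float GELU_COEF_A    = 0.044715f;
            const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
            act = 0.5f * dot_gate * (1.0f + std::tanh(SQRT_2_OVER_PI * dot_gate * (1.0f + GELU_COEF_A * dot_gate * dot_gate)));
            break;
        }
    }
    return dot_up * act; // dst[row] = (up_row . y) * GLU(gate_row . y)
}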