Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions ggml/src/ggml-opencl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -82,9 +82,13 @@ set(GGML_OPENCL_KERNELS
mul_mv_q4_0_f32_1d_8x_flat
mul_mv_q4_0_f32_1d_16x_flat
mul_mv_q6_k
mul_mv_q8_0_f32
mul_mv_q8_0_f32_flat
mul_mv_mxfp4_f32
mul_mv_mxfp4_f32_flat
mul_mv_id_q4_0_f32_8x_flat
mul_mv_id_q8_0_f32
mul_mv_id_q8_0_f32_flat
mul_mv_id_mxfp4_f32
mul_mv_id_mxfp4_f32_flat
mul_mm_f32_f32_l4_lm
Expand Down
389 changes: 382 additions & 7 deletions ggml/src/ggml-opencl/ggml-opencl.cpp

Large diffs are not rendered by default.

42 changes: 40 additions & 2 deletions ggml/src/ggml-opencl/kernels/cvt.cl
Original file line number Diff line number Diff line change
Expand Up @@ -117,9 +117,8 @@ kernel void kernel_convert_block_q4_0_noshuffle(
}
}


//------------------------------------------------------------------------------
// block_q4_0
// block_mxfp4
//------------------------------------------------------------------------------
#define QK_MXFP4 32
struct block_mxfp4 {
Expand Down Expand Up @@ -162,3 +161,42 @@ kernel void kernel_restore_block_mxfp4(
b->qs[i] = q[i];
}
}

//------------------------------------------------------------------------------
// block_q8_0
//------------------------------------------------------------------------------
typedef struct {
half d; // delta
char qs[QK8_0]; // quants
} block_q8_0;

kernel void kernel_convert_block_q8_0(
global block_q8_0 * src0,
global uchar * dst_q,
global half * dst_d
) {
global block_q8_0 * b = (global block_q8_0 *) src0 + get_global_id(0);
global uchar * q = (global uchar *) dst_q + QK8_0*get_global_id(0);
global half * d = (global half *) dst_d + get_global_id(0);

*d = b->d;

for (int i = 0; i < QK8_0; ++i) {
q[i] = b->qs[i];
}
}

kernel void kernel_restore_block_q8_0(
global uchar * src_q,
global half * src_d,
global block_q8_0 * dst
) {
global block_q8_0 * b = (global block_q8_0 *) dst + get_global_id(0);
global uchar * q = (global uchar *) src_q + QK8_0*get_global_id(0);
global half * d = (global half *) src_d + get_global_id(0);

b->d = *d;
for (int i = 0; i < QK8_0; ++i) {
b->qs[i] = q[i];
}
}
140 changes: 140 additions & 0 deletions ggml/src/ggml-opencl/kernels/mul_mv_id_q8_0_f32.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

#ifdef cl_intel_subgroups
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

#ifdef cl_intel_required_subgroup_size
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

#define QK8_0 32
typedef struct {
half d; // delta
char qs[QK8_0]; // quants
} block_q8_0;

#define NB_Q8_0 8

#ifdef INTEL_GPU
#define N_R0_Q8_0 4 // number of rows each subgroup works on
#define N_SG_Q8_0 2 // number of subgroups in a work group
#define N_SIMDWIDTH 16 // subgroup size
#elif defined (ADRENO_GPU)
#define N_R0_Q8_0 4
#define N_SG_Q8_0 2
#define N_SIMDWIDTH 64
#endif

#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_id_q8_0_f32(
global char * src0,
ulong offset0,
global char * src1,
ulong offset1,
global char * src2,
ulong offset2,
global char * dst,
ulong offsetd,
int ne00,
int ne01,
ulong nb01,
ulong nb02,
int ne11,
int ne12,
ulong nb11,
ulong nb12,
int ne20,
int ne21,
ulong nb21,
int ne0,
int ne1
) {
src0 = (global char *)((global char *)src0 + offset0);
src1 = (global char *)((global char *)src1 + offset1);
src2 = (global char *)((global char *)src2 + offset2);
dst = (global char *)((global char *)dst + offsetd);

int iid1 = get_group_id(2)/ne20;
int idx = get_group_id(2)%ne20;

int i02 = ((global int *) (src2 + iid1*nb21))[idx];

int i11_ = idx % ne11;
int i12_ = iid1;

int i1 = idx;
int i2 = i12_;

global char * src0_cur = src0 + i02*nb02;
global char * src1_cur = src1 + i11_*nb11 + i12_*nb12;

global char * dst_cur = dst + (i1*ne0 + i2*ne1*ne0)*sizeof(float);

int nb = ne00/QK8_0;

int r0 = get_group_id(0);
int r1 = get_group_id(1);

int first_row = (r0*N_SG_Q8_0 + get_sub_group_id()) * N_R0_Q8_0;

ulong offset_src1 = r1*nb11;
global float * y = (global float *) (src1_cur + offset_src1);

// pointers to src0 rows
global block_q8_0 * ax[N_R0_Q8_0];
for (int row = 0; row < N_R0_Q8_0; ++row) {
ulong offset_src0 = (first_row + row)*nb01;
ax[row] = (global block_q8_0 *) ((global char *) src0_cur + offset_src0);
}

float yl[NB_Q8_0];
float sumf[N_R0_Q8_0] = { 0.f };

const short ix = get_sub_group_local_id()/4;
const short il = get_sub_group_local_id()%4;

global float * yb = y + ix*QK8_0 + il*NB_Q8_0;

// each thread handles NB_Q8_0 quants at a time
for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/4) {
for (short i = 0; i < NB_Q8_0; ++i) {
yl[i] = yb[i];
}

for (short row = 0; row < N_R0_Q8_0; row++) {
global char * qs = ax[row][ib].qs + il*NB_Q8_0;
float sumq = 0.f;
for (short iq = 0; iq < NB_Q8_0; ++iq) {
sumq += qs[iq] * yl[iq];
}
sumf[row] += sumq*ax[row][ib].d;
}

yb += N_SIMDWIDTH*NB_Q8_0;
}

global float * dst_f32 = (global float *) dst_cur + (ulong)r1*ne0;

for (int row = 0; row < N_R0_Q8_0; ++row) {
float tot = sub_group_reduce_add(sumf[row]);

if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
dst_f32[first_row + row] = tot;
}
}
}
Loading
Loading