|
// fp16 is required because block_q8_0 stores its scale as a `half`.
#pragma OPENCL EXTENSION cl_khr_fp16 : enable

// Subgroup built-ins: take Intel's extension when the compiler advertises it,
// otherwise fall back to the Khronos extension.
#if defined(cl_intel_subgroups)
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
#else
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
#endif

// Vendor detection, keyed off whichever "required subgroup size" extension
// macro the compiler defines. Sets INTEL_GPU or ADRENO_GPU and provides
// REQD_SUBGROUP_SIZE_* attribute macros for pinning a kernel's subgroup size.
// If neither extension is present, none of these macros are defined.
#if defined(cl_intel_required_subgroup_size)
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
#define INTEL_GPU 1
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
#elif defined(cl_qcom_reqd_sub_group_size)
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
#define ADRENO_GPU 1
// The Qualcomm attribute takes "half"/"full"; the macro names label these as
// 64 and 128 respectively.
#define REQD_SUBGROUP_SIZE_64  __attribute__((qcom_reqd_sub_group_size("half")))
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
#endif

// Number of quantized values stored in one Q8_0 block.
#define QK8_0 32

// One Q8_0 block: QK8_0 8-bit quants that share a single half-precision scale.
typedef struct {
    half d;           // scale ("delta") applied to the block's dot product
    char qs[QK8_0];   // quantized values
} block_q8_0;

// Quants consumed by one work-item per block visit (a quarter of a QK8_0 block).
#define NB_Q8_0 8

// Per-vendor launch geometry. These are only defined for the GPUs detected
// above; other targets get no definition here.
#if defined(INTEL_GPU)
#define N_R0_Q8_0   4   // rows computed by each subgroup
#define N_SG_Q8_0   2   // subgroups per work-group
#define N_SIMDWIDTH 16  // subgroup size
#elif defined(ADRENO_GPU)
#define N_R0_Q8_0   4   // rows computed by each subgroup
#define N_SG_Q8_0   2   // subgroups per work-group
#define N_SIMDWIDTH 64  // subgroup size
#endif

// Indirect ("id") matrix-vector product: Q8_0 rows from src0 times f32 vectors
// from src1, written as f32 to dst.
//
// Work decomposition, as established by the indexing below:
//   - get_group_id(2) encodes the pair (iid1, idx) = (gid2 / ne20, gid2 % ne20).
//     src2 is read as an int table; entry [iid1][idx] (row stride nb21 bytes)
//     is the index i02 of the src0 matrix to use (matrix stride nb02 bytes).
//   - get_group_id(0) together with the subgroup id selects N_R0_Q8_0
//     consecutive src0 rows (row stride nb01 bytes).
//   - get_group_id(1) (r1) adds a further row offset into src1 and dst.
//   - Inside a subgroup, four adjacent lanes share one QK8_0-value block, each
//     lane handling NB_Q8_0 values. Per-lane partial sums are combined with
//     sub_group_reduce_add and lane 0 stores the result.
//
// Arguments:
//   src0, src1, src2, dst  base pointers; offset0/1/2/d are byte offsets added
//                          to them before any other indexing.
//   ne00                   values per src0 row; ne00 / QK8_0 blocks are visited.
//   ne01                   number of src0 rows; rows >= ne01 are never stored.
//   nb01, nb02             byte strides of src0 (row, matrix).
//   ne11, nb11, nb12       src1 row count used for wrap-around (idx % ne11)
//                          and its byte strides.
//   ne20, nb21             width of the src2 index table and its row stride.
//   ne0, ne1               dst extents used to compute the output offset.
//   ne12, ne21             accepted but not used by this kernel.
//
// NOTE(review): presumably this is the mixture-of-experts path where src2
// holds the selected expert ids per token - confirm against the host code.
#ifdef INTEL_GPU
REQD_SUBGROUP_SIZE_16
#elif defined (ADRENO_GPU)
REQD_SUBGROUP_SIZE_64
#endif
kernel void kernel_mul_mv_id_q8_0_f32(
        global char * src0,
        ulong         offset0,
        global char * src1,
        ulong         offset1,
        global char * src2,
        ulong         offset2,
        global char * dst,
        ulong         offsetd,
        int           ne00,
        int           ne01,
        ulong         nb01,
        ulong         nb02,
        int           ne11,
        int           ne12,
        ulong         nb11,
        ulong         nb12,
        int           ne20,
        int           ne21,
        ulong         nb21,
        int           ne0,
        int           ne1
) {
    src0 = (global char *)((global char *)src0 + offset0);
    src1 = (global char *)((global char *)src1 + offset1);
    src2 = (global char *)((global char *)src2 + offset2);
    dst  = (global char *)((global char *)dst  + offsetd);

    // Split the z work-group id into (table row, table column).
    int iid1 = get_group_id(2)/ne20;
    int idx  = get_group_id(2)%ne20;

    // Index of the src0 matrix selected for this (iid1, idx) pair.
    int i02 = ((global int *) (src2 + iid1*nb21))[idx];

    int i11_ = idx % ne11;
    int i12_ = iid1;

    int i1 = idx;
    int i2 = i12_;

    global char * src0_cur = src0 + i02*nb02;
    global char * src1_cur = src1 + i11_*nb11 + i12_*nb12;

    // Widen before multiplying: i2*ne1*ne0 is an element count that can exceed
    // the range of a 32-bit int for large outputs. This matches the (ulong)
    // widening already applied to the r1*ne0 term further down.
    global char * dst_cur = dst + ((ulong)i1*ne0 + (ulong)i2*ne1*ne0)*sizeof(float);

    int nb = ne00/QK8_0;

    int r0 = get_group_id(0);
    int r1 = get_group_id(1);

    // First of the N_R0_Q8_0 rows owned by this subgroup.
    int first_row = (r0*N_SG_Q8_0 + get_sub_group_id()) * N_R0_Q8_0;

    ulong offset_src1 = r1*nb11;
    global float * y  = (global float *) (src1_cur + offset_src1);

    // Pointers to the src0 rows handled by this subgroup.
    // Rows at or beyond ne01 are never stored (see the guarded store at the
    // end), but they would still be dereferenced in the main loop. Alias them
    // to the last valid row so no read lands outside the selected matrix.
    // The clamp is uniform across the subgroup, so control flow is unchanged.
    const int last_row = ne01 - 1;
    global block_q8_0 * ax[N_R0_Q8_0];
    for (int row = 0; row < N_R0_Q8_0; ++row) {
        const int src_row = min(first_row + row, last_row);
        ulong offset_src0 = src_row*nb01;
        ax[row] = (global block_q8_0 *) ((global char *) src0_cur + offset_src0);
    }

    float yl[NB_Q8_0];
    float sumf[N_R0_Q8_0] = { 0.f };

    // ix: which block this lane starts on; il: which NB_Q8_0-wide slice of
    // that block the lane owns.
    const short ix = get_sub_group_local_id()/4;
    const short il = get_sub_group_local_id()%4;

    global float * yb = y + ix*QK8_0 + il*NB_Q8_0;

    // Each work-item handles NB_Q8_0 quants at a time; the subgroup as a whole
    // covers N_SIMDWIDTH/4 blocks per iteration.
    for (int ib = ix; ib < nb; ib += N_SIMDWIDTH/4) {
        // Cache this lane's slice of src1 so it is reused for every row.
        for (short i = 0; i < NB_Q8_0; ++i) {
            yl[i] = yb[i];
        }

        for (short row = 0; row < N_R0_Q8_0; row++) {
            global char * qs = ax[row][ib].qs + il*NB_Q8_0;
            float sumq = 0.f;
            for (short iq = 0; iq < NB_Q8_0; ++iq) {
                sumq += qs[iq] * yl[iq];
            }
            // Apply the block scale once per slice rather than per element.
            sumf[row] += sumq*ax[row][ib].d;
        }

        // Advance by the N_SIMDWIDTH/4 blocks just consumed:
        // (N_SIMDWIDTH/4) * QK8_0 == N_SIMDWIDTH * NB_Q8_0 floats.
        yb += N_SIMDWIDTH*NB_Q8_0;
    }

    global float * dst_f32 = (global float *) dst_cur + (ulong)r1*ne0;

    for (int row = 0; row < N_R0_Q8_0; ++row) {
        // Every lane must reach the reduction, so the bounds check is applied
        // only to the store.
        float tot = sub_group_reduce_add(sumf[row]);

        if (get_sub_group_local_id() == 0 && first_row + row < ne01) {
            dst_f32[first_row + row] = tot;
        }
    }
}
0 commit comments