@@ -81,6 +81,17 @@ static __device__ void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
8181 }
8282}
8383
84+ static __device__ void cpy_blck_q8_0_f32 (const char * cxi, char * cdsti) {
85+ const block_q8_0 * xi = (const block_q8_0 *) cxi;
86+ float * dsti = (float *) cdsti;
87+
88+ const float d = (float )xi->d ;
89+
90+ for (int j = 0 ; j < QK8_0; j++) {
91+ dsti[j] = xi->qs [j] * d;
92+ }
93+ }
94+
8495static __device__ void cpy_blck_f32_q4_0 (const char * cxi, char * cdsti) {
8596 const float * xi = (const float *) cxi;
8697 block_q4_0 * dsti = (block_q4_0 *) cdsti;
@@ -288,6 +299,32 @@ static __global__ void cpy_f32_q(const char * cx, char * cdst, const int ne,
288299 cpy_blck (cx + x_offset, cdst + dst_offset);
289300}
290301
302+ template <cpy_kernel_t cpy_blck, int qk>
303+ static __global__ void cpy_q_f32 (const char * cx, char * cdst, const int ne,
304+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
305+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11,
306+ const int nb12, const int nb13) {
307+ const int i = (blockDim .x *blockIdx .x + threadIdx .x )*qk;
308+
309+ if (i >= ne) {
310+ return ;
311+ }
312+
313+ const int i03 = i/(ne00 * ne01 * ne02);
314+ const int i02 = (i - i03*ne00*ne01*ne02 )/ (ne00*ne01);
315+ const int i01 = (i - i03*ne00*ne01*ne02 - i02*ne01*ne00) / ne00;
316+ const int i00 = i - i03*ne00*ne01*ne02 - i02*ne01*ne00 - i01*ne00;
317+ const int x_offset = (i00/qk)*nb00 + i01*nb01 + i02*nb02 + i03 * nb03;
318+
319+ const int i13 = i/(ne10 * ne11 * ne12);
320+ const int i12 = (i - i13*ne10*ne11*ne12) / (ne10*ne11);
321+ const int i11 = (i - i13*ne10*ne11*ne12 - i12*ne10*ne11) / ne10;
322+ const int i10 = i - i13*ne10*ne11*ne12 - i12*ne10*ne11 - i11*ne10;
323+ const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13;
324+
325+ cpy_blck (cx + x_offset, cdst + dst_offset);
326+ }
327+
291328static void ggml_cpy_f16_f32_cuda (
292329 const char * cx, char * cdst, const int ne,
293330 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -329,6 +366,16 @@ static void ggml_cpy_f32_q8_0_cuda(
329366 (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
330367}
331368
369+ static void ggml_cpy_q8_0_f32_cuda (
370+ const char * cx, char * cdst, const int ne,
371+ const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
372+ const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream) {
373+
374+ const int num_blocks = ne;
375+ cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1 , 0 , stream>>>
376+ (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13);
377+ }
378+
332379static void ggml_cpy_f32_q4_0_cuda (
333380 const char * cx, char * cdst, const int ne,
334381 const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
@@ -437,6 +484,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
437484 ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
438485 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
439486 ggml_cpy_f32_q8_0_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
487+ } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
488+ ggml_cpy_q8_0_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
440489 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
441490 ggml_cpy_f32_q4_0_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
442491 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
@@ -471,6 +520,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
471520 return (void *) cpy_f32_f16<cpy_1_f32_f16>;
472521 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
473522 return (void *) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
523+ } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
524+ return (void *) cpy_q_f32<cpy_blck_q8_0_f32, QK8_0>;
474525 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
475526 return (void *) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
476527 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
0 commit comments