@@ -10,12 +10,12 @@ static __device__ void cpy_1_f32_f32(const char * cxi, char * cdsti) {
     *dsti = *xi;
 }
 
-static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti;
+// static __device__ void cpy_1_f32_bf16(const char * cxi, char * cdsti) {
+// const float * xi = (const float *) cxi;
+// nv_bfloat16 * dsti = (nv_bfloat16 *) cdsti;
 
-    *dsti = *xi;
-}
+// *dsti = *xi;
+// }
 
 static __device__ void cpy_1_f32_f16 (const char * cxi, char * cdsti) {
     const float * xi = (const float *) cxi;
@@ -463,15 +463,15 @@ static void ggml_cpy_f32_f16_cuda(
         (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
 }
 
-static void ggml_cpy_f32_bf16_cuda(
-    const char * cx, char * cdst, const int ne,
-    const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
+// static void ggml_cpy_f32_bf16_cuda(
+// const char * cx, char * cdst, const int ne,
+// const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
+// const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
-        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
-}
+// const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+// cpy_f32_f16<cpy_1_f32_bf16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+// (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);
+// }
 
 static void ggml_cpy_f32_q8_0_cuda (
     const char * cx, char * cdst, const int ne,
@@ -716,8 +716,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         ggml_cpy_f32_q6_0_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
-        ggml_cpy_f32_bf16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
+// } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
+// ggml_cpy_f32_bf16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
     } else {
@@ -746,8 +746,8 @@ void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
         return (void *) cpy_f32_f16<cpy_1_f32_bf16>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
         return (void *) cpy_f32_f16<cpy_1_f32_f16>;
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
-        return (void *) cpy_f32_f16<cpy_1_f32_bf16>;
+// } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
+// return (void *) cpy_f32_f16<cpy_1_f32_bf16>;
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         return (void *) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
     } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
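
For reference, below is a minimal standalone sketch of the f32 -> bf16 element copy that the commented-out path performed. It is not part of this commit: the kernel name cpy_f32_bf16_flat and the flat-array layout are illustrative only (the real ggml copy kernels walk the strided ne*/nb* tensor dimensions), and it assumes CUDA's cuda_bf16.h header, which provides __nv_bfloat16, __float2bfloat16 and __bfloat162float.

#include <cstdio>
#include <cuda_bf16.h>

// Illustrative flat-array variant: convert n contiguous f32 values to bf16.
__global__ void cpy_f32_bf16_flat(const float * x, __nv_bfloat16 * dst, int n) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        dst[i] = __float2bfloat16(x[i]); // round f32 to bf16 (8-bit exponent, 7-bit mantissa)
    }
}

int main() {
    const int n = 4;
    const float hx[n] = {1.0f, 3.14159f, -2.5f, 1e-3f};

    float         * dx = nullptr;
    __nv_bfloat16 * dy = nullptr;
    cudaMalloc(&dx, n*sizeof(float));
    cudaMalloc(&dy, n*sizeof(__nv_bfloat16));
    cudaMemcpy(dx, hx, n*sizeof(float), cudaMemcpyHostToDevice);

    cpy_f32_bf16_flat<<<1, 64>>>(dx, dy, n);

    __nv_bfloat16 hy[n];
    cudaMemcpy(hy, dy, n*sizeof(__nv_bfloat16), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; ++i) {
        printf("%g -> %g\n", hx[i], __bfloat162float(hy[i])); // convert back to f32 for printing
    }

    cudaFree(dx);
    cudaFree(dy);
    return 0;
}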