@@ -348,6 +348,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
// Copy destination pointers to GPU to be available when pointer indirection is in use

// Uploads the host-side array of destination pointers into
// cuda_graph->dest_ptrs_d (growing the device buffer only when the cached
// capacity is insufficient) and resets graph_cpynode_index to 0 so the next
// graph capture starts indexing from the first copy node.
// Compiled to a no-op unless CUDA or HIP graph support is enabled.
void ggml_backend_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size) {
#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
    if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
        if (cuda_graph->dest_ptrs_d != nullptr) {
            CUDA_CHECK(cudaFree(cuda_graph->dest_ptrs_d));
        }
        CUDA_CHECK(cudaMalloc(&cuda_graph->dest_ptrs_d, host_dest_ptrs_size*sizeof(char *)));
        // Record the new capacity; without this the guard above would force a
        // free+malloc cycle on every call.
        cuda_graph->dest_ptrs_size = host_dest_ptrs_size;
    }
    // copy destination pointers to GPU
    CUDA_CHECK(cudaMemcpy(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice));
    cuda_graph->graph_cpynode_index = 0; // reset index
#endif
}
360362
361363static void ggml_cpy_f16_f32_cuda (
@@ -560,46 +562,55 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
560562 char * src0_ddc = (char *) src0->data ;
561563 char * src1_ddc = (char *) src1->data ;
562564
565+ char ** dest_ptrs_d = nullptr ;
566+ int graph_cpynode_index = -1 ;
567+ #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
568+ dest_ptrs_d = ctx.cuda_graph ->dest_ptrs_d ;
569+ graph_cpynode_index = ctx.cuda_graph ->graph_cpynode_index ;
570+ #endif
563571 if (src0->type == src1->type && ggml_is_contiguous (src0) && ggml_is_contiguous (src1)) {
564572 GGML_ASSERT (ggml_nbytes (src0) == ggml_nbytes (src1));
565573 CUDA_CHECK (cudaMemcpyAsync (src1_ddc, src0_ddc, ggml_nbytes (src0), cudaMemcpyDeviceToDevice, main_stream));
566574 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
567- ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
575+ ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
568576 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
569- ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
577+ ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
570578 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
571- ggml_cpy_f32_q8_0_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
579+ ggml_cpy_f32_q8_0_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
572580 } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
573- ggml_cpy_q8_0_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
581+ ggml_cpy_q8_0_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
574582 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
575- ggml_cpy_f32_q4_0_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
583+ ggml_cpy_f32_q4_0_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
576584 } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
577585 ggml_cpy_q4_0_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
578- nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
586+ nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
579587 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
580- ggml_cpy_f32_q4_1_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
588+ ggml_cpy_f32_q4_1_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
581589 } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
582590 ggml_cpy_q4_1_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
583- nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
591+ nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
584592 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
585- ggml_cpy_f32_q5_0_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
593+ ggml_cpy_f32_q5_0_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
586594 } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
587595 ggml_cpy_q5_0_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
588- nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
596+ nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
589597 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
590- ggml_cpy_f32_iq4_nl_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
598+ ggml_cpy_f32_iq4_nl_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
591599 } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
592- ggml_cpy_f32_q5_1_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
600+ ggml_cpy_f32_q5_1_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
593601 } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
594- ggml_cpy_q5_1_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
602+ ggml_cpy_q5_1_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
595603 } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
596- ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
604+ ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
597605 } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
598- ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx. cuda_graph -> dest_ptrs_d , ctx. cuda_graph -> graph_cpynode_index );
606+ ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
599607 } else {
600608 GGML_ABORT (" %s: unsupported type combination (%s to %s)\n " , __func__,
601609 ggml_type_name (src0->type ), ggml_type_name (src1->type ));
602610 }
611+ #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
612+ ctx.cuda_graph ->graph_cpynode_index = graph_cpynode_index;
613+ #endif
603614
604615}
605616
0 commit comments