@@ -347,26 +347,21 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
 
 // Copy destination pointers to GPU to be available when pointer indirection is in use
 
-static char ** dest_ptrs;
-static int dest_ptrs_size = 0;
-// Index to allow each cpy kernel to be aware of it's position within the graph relative to other cpy nodes.
-static int graph_cpynode_index = -1;
-
-void ggml_backend_dest_ptrs_copy(char ** host_dest_ptrs, const int host_dest_ptrs_size) {
-    if (dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
-        if (dest_ptrs != nullptr) cudaFree(dest_ptrs);
-        cudaMalloc(&dest_ptrs, host_dest_ptrs_size*sizeof(char *));
-        dest_ptrs_size = host_dest_ptrs_size;
+void ggml_backend_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size) {
+    if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
+        if (cuda_graph->dest_ptrs_d != nullptr) cudaFree(cuda_graph->dest_ptrs_d);
+        cudaMalloc(&cuda_graph->dest_ptrs_d, host_dest_ptrs_size*sizeof(char *));
+        cuda_graph->dest_ptrs_size = host_dest_ptrs_size;
     }
     // copy destination pointers to GPU
-    cudaMemcpy(dest_ptrs, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice);
-    graph_cpynode_index = 0; // reset index
+    cudaMemcpy(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice);
+    cuda_graph->graph_cpynode_index = 0; // reset index
 }
 
 static void ggml_cpy_f16_f32_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
     cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
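
Note on this hunk: the destination-pointer table and the copy-node index move from file-scope globals into per-context graph state, so per-context use no longer goes through shared statics. A minimal sketch of the ggml_cuda_graph members this diff assumes (the struct itself is declared elsewhere, e.g. in the backend's common header; the initializers mirror the removed globals):

    // Sketch only: the fields this commit references, with assumed defaults.
    struct ggml_cuda_graph {
        char ** dest_ptrs_d         = nullptr; // device buffer, one destination pointer per cpy node
        int     dest_ptrs_size      = 0;       // capacity of dest_ptrs_d, in pointers
        int     graph_cpynode_index = -1;      // position of the next cpy node within the graph
    };
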
@@ -376,7 +371,7 @@ static void ggml_cpy_f16_f32_cuda(
 static void ggml_cpy_f32_f32_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
     cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
@@ -386,7 +381,7 @@ static void ggml_cpy_f32_f32_cuda(
 static void ggml_cpy_f32_f16_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
     cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
@@ -396,7 +391,7 @@ static void ggml_cpy_f32_f16_cuda(
 static void ggml_cpy_f32_q8_0_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     GGML_ASSERT(ne % QK8_0 == 0);
     const int num_blocks = ne / QK8_0;
@@ -407,7 +402,7 @@ static void ggml_cpy_f32_q8_0_cuda(
 static void ggml_cpy_q8_0_f32_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
@@ -417,7 +412,7 @@ static void ggml_cpy_q8_0_f32_cuda(
 static void ggml_cpy_f32_q4_0_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     GGML_ASSERT(ne % QK4_0 == 0);
     const int num_blocks = ne / QK4_0;
@@ -431,7 +426,7 @@ static void ggml_cpy_q4_0_f32_cuda(
     const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12,
     const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect) {
+    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
     const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
@@ -441,7 +436,7 @@ static void ggml_cpy_q4_0_f32_cuda(
 static void ggml_cpy_f32_q4_1_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     GGML_ASSERT(ne % QK4_1 == 0);
     const int num_blocks = ne / QK4_1;
@@ -455,7 +450,7 @@ static void ggml_cpy_q4_1_f32_cuda(
     const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12,
     const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect) {
+    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
     const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
@@ -465,7 +460,7 @@ static void ggml_cpy_q4_1_f32_cuda(
 static void ggml_cpy_f32_q5_0_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     GGML_ASSERT(ne % QK5_0 == 0);
     const int num_blocks = ne / QK5_0;
@@ -479,7 +474,7 @@ static void ggml_cpy_q5_0_f32_cuda(
     const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12,
     const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect) {
+    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
     const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
@@ -489,7 +484,7 @@ static void ggml_cpy_q5_0_f32_cuda(
 static void ggml_cpy_f32_q5_1_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     GGML_ASSERT(ne % QK5_1 == 0);
     const int num_blocks = ne / QK5_1;
@@ -503,7 +498,7 @@ static void ggml_cpy_q5_1_f32_cuda(
     const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12,
     const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect) {
+    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
     const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
@@ -513,7 +508,7 @@ static void ggml_cpy_q5_1_f32_cuda(
 static void ggml_cpy_f32_iq4_nl_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     GGML_ASSERT(ne % QK4_NL == 0);
     const int num_blocks = ne / QK4_NL;
@@ -524,7 +519,7 @@ static void ggml_cpy_f32_iq4_nl_cuda(
 static void ggml_cpy_f16_f16_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
     cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
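
Every host wrapper above now takes graph_cpynode_index by reference instead of reading a global. The launch argument lists are truncated out of these hunks, but presumably each wrapper forwards cdst_indirect together with graph_cpynode_index (incrementing it) to its kernel, so consecutive cpy nodes consume consecutive slots of the device-side table. A sketch of the device-side selection that implies, matching the cdst_direct parameter visible in the cpy_q_f32 hunk header:

    // Sketch (assumed, not shown in this diff): how a cpy kernel picks its
    // destination when pointer indirection is in use.
    static __device__ __forceinline__ char * cpy_dest(
            char * cdst_direct, char ** cdst_indirect, const int graph_cpynode_index) {
        // With indirection active, fetch this node's destination from the
        // table uploaded by ggml_backend_dest_ptrs_copy; otherwise use the
        // pointer baked into the kernel arguments.
        return cdst_indirect != nullptr ? cdst_indirect[graph_cpynode_index] : cdst_direct;
    }
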
@@ -569,38 +564,38 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
         CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
-        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
-        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
-        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else {
         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
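
Taken together: before launching (or re-capturing) a CUDA graph, the caller gathers the current destination pointer of every cpy node in graph order and hands the array to ggml_backend_dest_ptrs_copy, which uploads it and resets graph_cpynode_index; each ggml_cuda_cpy dispatch above then consumes one slot per node. A hypothetical call site (cpy_nodes and ctx are illustrative names, and ctx.cuda_graph is assumed to be a ggml_cuda_graph *):

    #include <vector>

    // Illustrative only: upload one destination pointer per cpy node, in the
    // order the nodes appear in the graph, then reset the per-context index.
    std::vector<char *> host_dest_ptrs;
    for (ggml_tensor * node : cpy_nodes) {        // the graph's cpy nodes
        host_dest_ptrs.push_back((char *) node->data);
    }
    ggml_backend_dest_ptrs_copy(ctx.cuda_graph, host_dest_ptrs.data(), (int) host_dest_ptrs.size());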