
Commit 1a2441a

Addressed comments

1 parent e9a1be0 commit 1a2441a

5 files changed: +45 -46 lines changed

ggml/include/ggml-backend.h

Lines changed: 0 additions & 3 deletions
@@ -349,9 +349,6 @@ extern "C" {
     GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
     GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
 
-    // Copy destination pointers for copy operations pointers to backend
-    GGML_API void ggml_backend_dest_ptrs_copy(char ** host_dest_ptrs, const int host_dest_ptrs_size);
-
 #ifdef __cplusplus
 }
 #endif
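
Note: the declaration is not simply dropped. As the cpy.cuh hunk at the end of this commit shows, ggml_backend_dest_ptrs_copy moves into the CUDA backend's internal header and gains a ggml_cuda_graph * parameter, so the copy-destination state is no longer exposed through the public backend API.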

ggml/src/ggml-cuda/common.cuh

Lines changed: 5 additions & 0 deletions
@@ -712,6 +712,11 @@ struct ggml_cuda_graph {
     int number_consecutive_updates = 0;
     std::vector<ggml_graph_node_properties> ggml_graph_properties;
     std::vector<char *> cpy_dest_ptrs;
+    char ** dest_ptrs_d = nullptr;
+    int dest_ptrs_size = 0;
+    // Index to allow each cpy kernel to be aware of its position within the graph
+    // relative to other cpy nodes.
+    int graph_cpynode_index = -1;
 #endif
 };
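
With these fields moved from file-scope statics in cpy.cu into ggml_cuda_graph, the device-side pointer table can be released together with the graph object instead of living until process exit. A minimal cleanup sketch (hypothetical: this commit excerpt does not show a destructor for ggml_cuda_graph):

    // Hypothetical sketch: free the device-side destination-pointer table
    // when the owning graph object is destroyed.
    ~ggml_cuda_graph() {
        if (dest_ptrs_d != nullptr) {
            cudaFree(dest_ptrs_d);
        }
    }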

ggml/src/ggml-cuda/cpy.cu

Lines changed: 37 additions & 42 deletions
@@ -347,26 +347,21 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
 
 // Copy destination pointers to GPU to be available when pointer indirection is in use
 
-static char ** dest_ptrs;
-static int dest_ptrs_size=0;
-// Index to allow each cpy kernel to be aware of it's position within the graph relative to other cpy nodes.
-static int graph_cpynode_index = -1;
-
-void ggml_backend_dest_ptrs_copy(char ** host_dest_ptrs, const int host_dest_ptrs_size) {
-    if(dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
-        if (dest_ptrs != nullptr) cudaFree(dest_ptrs);
-        cudaMalloc(&dest_ptrs, host_dest_ptrs_size*sizeof(char *));
-        dest_ptrs_size = host_dest_ptrs_size;
+void ggml_backend_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size) {
+    if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
+        if (cuda_graph->dest_ptrs_d != nullptr) cudaFree(cuda_graph->dest_ptrs_d);
+        cudaMalloc(&cuda_graph->dest_ptrs_d, host_dest_ptrs_size*sizeof(char *));
+        cuda_graph->dest_ptrs_size = host_dest_ptrs_size;
     }
     // copy destination pointers to GPU
-    cudaMemcpy(dest_ptrs, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice);
-    graph_cpynode_index = 0; // reset index
+    cudaMemcpy(cuda_graph->dest_ptrs_d, host_dest_ptrs, host_dest_ptrs_size*sizeof(char *), cudaMemcpyHostToDevice);
+    cuda_graph->graph_cpynode_index = 0; // reset index
 }
 
 static void ggml_cpy_f16_f32_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
     cpy_f32_f16<cpy_1_f16_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
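
Because ggml_backend_dest_ptrs_copy now takes the graph explicitly, the expected call site (in ggml-cuda.cu, not part of this excerpt) gathers each copy node's destination into the graph's cpy_dest_ptrs vector and uploads the whole table in one call, which also resets graph_cpynode_index for the next pass. A sketch of that pattern, assuming the backend context exposes the graph as cuda_ctx->cuda_graph (a smart pointer) and that copy nodes are identified by GGML_OP_CPY:

    // Hypothetical sketch of the caller: collect destination pointers
    // while walking the graph nodes...
    cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
    for (int i = 0; i < cgraph->n_nodes; ++i) {
        ggml_tensor * node = cgraph->nodes[i];
        if (node->op == GGML_OP_CPY) {
            cuda_ctx->cuda_graph->cpy_dest_ptrs.push_back((char *) node->data);
        }
    }
    // ...then upload them to the GPU in a single memcpy and reset the index.
    ggml_backend_dest_ptrs_copy(cuda_ctx->cuda_graph.get(),
        cuda_ctx->cuda_graph->cpy_dest_ptrs.data(),
        (int) cuda_ctx->cuda_graph->cpy_dest_ptrs.size());
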
@@ -376,7 +371,7 @@ static void ggml_cpy_f16_f32_cuda(
 static void ggml_cpy_f32_f32_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
     cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
@@ -386,7 +381,7 @@ static void ggml_cpy_f32_f32_cuda(
 static void ggml_cpy_f32_f16_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
     cpy_f32_f16<cpy_1_f32_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
@@ -396,7 +391,7 @@ static void ggml_cpy_f32_f16_cuda(
 static void ggml_cpy_f32_q8_0_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     GGML_ASSERT(ne % QK8_0 == 0);
     const int num_blocks = ne / QK8_0;
@@ -407,7 +402,7 @@ static void ggml_cpy_f32_q8_0_cuda(
 static void ggml_cpy_q8_0_f32_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q8_0_f32, QK8_0><<<num_blocks, 1, 0, stream>>>
@@ -417,7 +412,7 @@ static void ggml_cpy_q8_0_f32_cuda(
 static void ggml_cpy_f32_q4_0_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     GGML_ASSERT(ne % QK4_0 == 0);
     const int num_blocks = ne / QK4_0;
@@ -431,7 +426,7 @@ static void ggml_cpy_q4_0_f32_cuda(
     const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12,
     const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect) {
+    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
     const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q4_0, QK4_0>, QK4_0><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
@@ -441,7 +436,7 @@ static void ggml_cpy_q4_0_f32_cuda(
 static void ggml_cpy_f32_q4_1_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     GGML_ASSERT(ne % QK4_1 == 0);
     const int num_blocks = ne / QK4_1;
@@ -455,7 +450,7 @@ static void ggml_cpy_q4_1_f32_cuda(
     const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12,
     const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect) {
+    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
     const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q4_1, QK4_1>, QK4_1><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
@@ -465,7 +460,7 @@ static void ggml_cpy_q4_1_f32_cuda(
 static void ggml_cpy_f32_q5_0_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     GGML_ASSERT(ne % QK5_0 == 0);
     const int num_blocks = ne / QK5_0;
@@ -479,7 +474,7 @@ static void ggml_cpy_q5_0_f32_cuda(
     const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12,
     const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect) {
+    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
     const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q5_0, QK5_0>, QK5_0><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
@@ -489,7 +484,7 @@ static void ggml_cpy_q5_0_f32_cuda(
 static void ggml_cpy_f32_q5_1_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     GGML_ASSERT(ne % QK5_1 == 0);
     const int num_blocks = ne / QK5_1;
@@ -503,7 +498,7 @@ static void ggml_cpy_q5_1_f32_cuda(
     const int nb00, const int nb01, const int nb02,
     const int nb03, const int ne10, const int ne11, const int ne12,
     const int nb10, const int nb11, const int nb12, const int nb13,
-    cudaStream_t stream, char ** cdst_indirect) {
+    cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
     const int num_blocks = ne;
     cpy_q_f32<cpy_blck_q_f32<dequantize_q5_1, QK5_1>, QK5_1><<<num_blocks, 1, 0, stream>>>(
         cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
@@ -513,7 +508,7 @@ static void ggml_cpy_q5_1_f32_cuda(
 static void ggml_cpy_f32_iq4_nl_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     GGML_ASSERT(ne % QK4_NL == 0);
     const int num_blocks = ne / QK4_NL;
@@ -524,7 +519,7 @@ static void ggml_cpy_f32_iq4_nl_cuda(
 static void ggml_cpy_f16_f16_cuda(
     const char * cx, char * cdst, const int ne,
     const int ne00, const int ne01, const int ne02, const int nb00, const int nb01, const int nb02,
-    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect) {
+    const int nb03, const int ne10, const int ne11, const int ne12, const int nb10, const int nb11, const int nb12, const int nb13, cudaStream_t stream, char ** cdst_indirect, int & graph_cpynode_index) {
 
     const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
     cpy_f32_f16<cpy_1_f16_f16><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
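
Each ggml_cpy_*_cuda wrapper now threads graph_cpynode_index through by reference instead of touching a global, so the counter being advanced is the one stored in the current graph. The wrapper bodies presumably hand the index to the kernel and post-increment it, and on the device the kernel resolves its destination through the table when indirection is active. A sketch of both halves (illustrative only, inferred from the cdst_indirect and cdst_direct parameters visible in this diff):

    // Host side (hypothetical): pass this copy node's slot, then advance the index.
    cpy_f32_f16<cpy_1_f32_f32><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
        (cx, cdst, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03,
         ne10, ne11, ne12, nb10, nb11, nb12, nb13, cdst_indirect, graph_cpynode_index++);

    // Device side (hypothetical): pick the real destination when indirection is enabled.
    char * cdst = (cdst_indirect != nullptr) ? cdst_indirect[graph_cpynode_index] : cdst_direct;
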
@@ -569,38 +564,38 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
         CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_q8_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_Q4_0 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_q4_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_Q4_1 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_q4_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
-        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_q5_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_Q5_0 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_q5_0_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02,
-            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+            nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
-        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_iq4_nl_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
-        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f32_q5_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs);
+        ggml_cpy_f16_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, ctx.cuda_graph->dest_ptrs_d, ctx.cuda_graph->graph_cpynode_index);
     } else {
         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));

ggml/src/ggml-cuda/cpy.cuh

Lines changed: 2 additions & 0 deletions
@@ -7,3 +7,5 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
+
+void ggml_backend_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size);
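
Design note: routing the copy-indirection state through ggml_cuda_graph * rather than translation-unit statics makes it per-context, so multiple CUDA backend contexts (for example, one per device) can each capture and update their own graph without clobbering a shared pointer table or node index.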
