Guard cpy destination-pointer indirection so it is only used when CUDA graphs are in use

agray3 · agray3 · commit 21fae96da024 · 2025-04-01T03:42:50.000-07:00
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
@@ -711,6 +711,7 @@ struct ggml_cuda_graph {
     bool disable_due_to_failed_graph_capture = false;
     int number_consecutive_updates = 0;
     std::vector<ggml_graph_node_properties> ggml_graph_properties;
+    bool use_cpy_indirection = false;
     std::vector<char *> cpy_dest_ptrs;
     char ** dest_ptrs_d;
     int dest_ptrs_size = 0;
diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
@@ -566,8 +566,10 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     char ** dest_ptrs_d = nullptr;
     int graph_cpynode_index = -1;
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
-    dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
-    graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
+    if(ctx.cuda_graph->use_cpy_indirection) {
+        dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
+        graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
+    }
 #endif
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
@@ -610,7 +612,9 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
     }
 #if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
-     ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
+    if(ctx.cuda_graph->use_cpy_indirection) {
+        ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
+    }
 #endif
 
 }
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2481,6 +2481,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 
     if(use_cuda_graph)
     {
+        cuda_ctx->cuda_graph->use_cpy_indirection = true;
         // copy pointers to GPU so they can be accessed via indirection within CUDA graph
         ggml_backend_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());
     }
@@ -2716,6 +2717,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
     }
 
+    if (!use_cuda_graph) cuda_ctx->cuda_graph->use_cpy_indirection = false;
+
 #else
     bool use_cuda_graph = false;
     bool cuda_graph_update_required = false;

Original file line number	Diff line number	Diff line change
`@@ -2481,6 +2481,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud`
`2481`	`2481`
`2482`	`2482`	`if(use_cuda_graph)`
`2483`	`2483`	`{`
	`2484`	`+ cuda_ctx->cuda_graph->use_cpy_indirection = true;`
`2484`	`2485`	`// copy pointers to GPU so they can be accessed via indirection within CUDA graph`
`2485`	`2486`	`ggml_backend_dest_ptrs_copy(cuda_ctx->cuda_graph.get(), cuda_ctx->cuda_graph->cpy_dest_ptrs.data(), cuda_ctx->cuda_graph->cpy_dest_ptrs.size(), cuda_ctx->stream());`
`2486`	`2487`	`}`
`@@ -2716,6 +2717,8 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,`
`2716`	`2717`	`CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));`
`2717`	`2718`	`}`
`2718`	`2719`
	`2720`	`+ if (!use_cuda_graph) cuda_ctx->cuda_graph->use_cpy_indirection = false;`
	`2721`	`+`
`2719`	`2722`	`#else`
`2720`	`2723`	`bool use_cuda_graph = false;`
`2721`	`2724`	`bool cuda_graph_update_required = false;`