@@ -2423,7 +2423,7 @@ static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
 
 #ifdef USE_CUDA_GRAPH
 static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
-    std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool use_cuda_graph) {
+    bool use_cuda_graph) {
 
     // Loop over nodes in GGML graph to obtain info needed for CUDA graph
     cuda_ctx->cuda_graph->cpy_dest_ptrs.clear();
@@ -2471,10 +2471,6 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud
 #ifndef NDEBUG
                 GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
 #endif
-            } else {
-                if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
-                    ggml_cuda_cpy_fn_ptrs.push_back(ptr);
-                }
             }
         }
 
@@ -2600,8 +2596,7 @@ static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
 #endif
 
 static void evaluate_and_capture_cuda_graph(ggml_backend_cuda_context * cuda_ctx, ggml_cgraph * cgraph,
-    [[maybe_unused]] std::vector<void *> & ggml_cuda_cpy_fn_ptrs, bool & graph_evaluated_or_captured, bool & use_cuda_graph,
-    bool & cuda_graph_update_required) {
+    bool & graph_evaluated_or_captured, bool & use_cuda_graph, bool & cuda_graph_update_required) {
 
     while (!graph_evaluated_or_captured) {
         // Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
@@ -2667,10 +2662,6 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 
     ggml_cuda_set_device(cuda_ctx->device);
 
-    // vector of pointers to CUDA cpy kernels, which are required to identify
-    // kernel parameters which need updated in the graph for each token
-    std::vector<void *> ggml_cuda_cpy_fn_ptrs;
-
 #ifdef USE_CUDA_GRAPH
     static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
 
@@ -2704,8 +2695,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
     if (use_cuda_graph) {
         cuda_graph_update_required = is_cuda_graph_update_required(cuda_ctx, cgraph);
 
-        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph,
-            ggml_cuda_cpy_fn_ptrs, use_cuda_graph);
+        use_cuda_graph = check_node_graph_compatibility_and_refresh_copy_ops(cuda_ctx, cgraph, use_cuda_graph);
 
         // Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
         if (use_cuda_graph && cuda_graph_update_required) {
@@ -2733,7 +2723,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
 
     bool graph_evaluated_or_captured = false;
 
-    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, ggml_cuda_cpy_fn_ptrs, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
+    evaluate_and_capture_cuda_graph(cuda_ctx, cgraph, graph_evaluated_or_captured, use_cuda_graph, cuda_graph_update_required);
 
     return GGML_STATUS_SUCCESS;
 }