@@ -191,7 +191,6 @@ void use(const cuda::device_t &device, const cuda::graph::template_t &graph, con
191191 cloned_graph_instance.launch (stream_for_graph);
192192 }
193193 stream_for_graph.synchronize ();
194- std::cout << std::endl;
195194}
196195
197196void cudaGraphsManual (
@@ -205,6 +204,12 @@ void cudaGraphsManual(
205204 report_mode (graph_construction_mode);
206205 report_attempt (" construction" , graph_construction_mode);
207206 double result_h = 0.0 ;
207+ // This ugly hack is due to a problem with the CUDA graph API, and specifically
208+ // with kernel launch node insertion: When you insert a node into a template,
209+ // the kernel argument values are _not_ copied by the CUDA driver, nor apparently
210+ // is the array of pointers to them. So, we need to extend the lifetime of this
211+ // information until after the graph template is instantiated.
212+ std::vector<std::vector<void *>> argument_ptr_sequences{};
208213
209214 using node_kind_t = cuda::graph::node::kind_t ;
210215 auto graph = cuda::graph::create ();
@@ -238,9 +243,9 @@ void cudaGraphsManual(
238243 .grid_size (outputVec_d.size ())
239244 .block_size (THREADS_PER_BLOCK)
240245 .build ();
241- auto kernel_arg_pointers = cuda::graph::make_kernel_argument_pointers (
242- inputVec_d.data (), outputVec_d.data (), inputVec_d.size (), outputVec_d.size ());
243- auto kernel_node_args = cuda::graph::make_launch_primed_kernel (reduce_kernel, launch_config, kernel_arg_pointers );
246+ argument_ptr_sequences. emplace_back ( cuda::graph::make_kernel_argument_pointers (
247+ inputVec_d.data (), outputVec_d.data (), inputVec_d.size (), outputVec_d.size ())) ;
248+ auto kernel_node_args = cuda::graph::make_launch_primed_kernel (reduce_kernel, launch_config, argument_ptr_sequences. back () );
244249 return graph.insert .node <node_kind_t ::kernel_launch>(kernel_node_args);
245250 }();
246251
@@ -261,8 +266,8 @@ void cudaGraphsManual(
261266 .grid_size (1 )
262267 .block_size (THREADS_PER_BLOCK)
263268 .build ();
264- auto arg_ptrs = cuda::graph::make_kernel_argument_pointers (outputVec_d.data (), result_d.data (), outputVec_d.size ());
265- return graph.insert .node <node_kind_t ::kernel_launch>(kernel, launch_config, arg_ptrs );
269+ argument_ptr_sequences. emplace_back ( cuda::graph::make_kernel_argument_pointers (outputVec_d.data (), result_d.data (), outputVec_d.size () ));
270+ return graph.insert .node <node_kind_t ::kernel_launch>(kernel, launch_config, argument_ptr_sequences. back () );
266271 }();
267272
268273 graph.insert .edge (reduce_node, reduce_final_node);
0 commit comments