Skip to content

Commit 6a92cce

Browse files
committed
Circumvents/fixes #672: In the simpleCudaGraphs example, we now keep alive the information that the CUDA graph template instantiation relies on (the kernel argument pointer arrays) until after the instantiation has occurred. We shouldn't have to do this, but the CUDA driver does not copy the relevant data itself.
Also removed a redundant command that printed a line break.
1 parent 279e86e commit 6a92cce

File tree

1 file changed

+11
-6
lines changed

1 file changed

+11
-6
lines changed

examples/modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,6 @@ void use(const cuda::device_t &device, const cuda::graph::template_t &graph, con
191191
cloned_graph_instance.launch(stream_for_graph);
192192
}
193193
stream_for_graph.synchronize();
194-
std::cout << std::endl;
195194
}
196195

197196
void cudaGraphsManual(
@@ -205,6 +204,12 @@ void cudaGraphsManual(
205204
report_mode(graph_construction_mode);
206205
report_attempt("construction", graph_construction_mode);
207206
double result_h = 0.0;
207+
// This ugly hack is due to a problem with the CUDA graph API, and specifically
208+
// with kernel launch node insertion: When you insert a node into a template,
209+
// the kernel argument values are _not_ copied by the CUDA driver, nor apparently
210+
// is the array of pointers to them. So, we need to extend the lifetime of this
211+
// information until after the graph template is instantiated.
212+
std::vector<std::vector<void*>> argument_ptr_sequences{};
208213

209214
using node_kind_t = cuda::graph::node::kind_t;
210215
auto graph = cuda::graph::create();
@@ -238,9 +243,9 @@ void cudaGraphsManual(
238243
.grid_size(outputVec_d.size())
239244
.block_size(THREADS_PER_BLOCK)
240245
.build();
241-
auto kernel_arg_pointers = cuda::graph::make_kernel_argument_pointers(
242-
inputVec_d.data(), outputVec_d.data(), inputVec_d.size(), outputVec_d.size());
243-
auto kernel_node_args = cuda::graph::make_launch_primed_kernel(reduce_kernel, launch_config, kernel_arg_pointers);
246+
argument_ptr_sequences.emplace_back(cuda::graph::make_kernel_argument_pointers(
247+
inputVec_d.data(), outputVec_d.data(), inputVec_d.size(), outputVec_d.size()));
248+
auto kernel_node_args = cuda::graph::make_launch_primed_kernel(reduce_kernel, launch_config, argument_ptr_sequences.back());
244249
return graph.insert.node<node_kind_t::kernel_launch>(kernel_node_args);
245250
}();
246251

@@ -261,8 +266,8 @@ void cudaGraphsManual(
261266
.grid_size(1)
262267
.block_size(THREADS_PER_BLOCK)
263268
.build();
264-
auto arg_ptrs = cuda::graph::make_kernel_argument_pointers(outputVec_d.data(), result_d.data(), outputVec_d.size());
265-
return graph.insert.node<node_kind_t::kernel_launch>(kernel, launch_config, arg_ptrs);
269+
argument_ptr_sequences.emplace_back(cuda::graph::make_kernel_argument_pointers(outputVec_d.data(), result_d.data(), outputVec_d.size()));
270+
return graph.insert.node<node_kind_t::kernel_launch>(kernel, launch_config, argument_ptr_sequences.back());
266271
}();
267272

268273
graph.insert.edge(reduce_node, reduce_final_node);

0 commit comments

Comments
 (0)