3434#include < numeric>
3535
3636#if __cplusplus >= 202001L
37- using span = :: std::span;
37+ using span = std::span;
3838#else
3939using cuda::span;
4040#endif
@@ -131,14 +131,14 @@ __global__ void reduceFinal(double *inputVec, double *result, size_t inputSize)
131131 if (cta.thread_rank () == 0 ) result[0 ] = temp_sum;
132132}
133133
134- void init_input (cuda:: span<float > a) {
134+ void init_input (span<float > a) {
135135 auto generator = []() { return static_cast <float >(rand () & 0xFF ) / static_cast <float >(RAND_MAX); };
136- :: std::generate_n (a.data(), a.size(), generator);
136+ std::generate_n (a.data (), a.size (), generator);
137137}
138138
/**
 * Prints, on standard output, the result handed to a graph's host-function
 * node, together with a description of how that graph was constructed.
 *
 * @param graph_construction_mode text describing the graph construction method
 * @param result the value to report
 *
 * @note `result` is received by value. The previous `result = 0.0` statement
 * therefore only modified this function's local copy and reset nothing; it
 * has been removed. A caller that needs its own value reset after the report
 * must do that itself.
 */
void myRealHostNodeCallback(char const *graph_construction_mode, double result)
{
    std::cout << " Host callback in graph constructed by " << graph_construction_mode << " : result = " << result << std::endl;
}
144144
@@ -152,15 +152,20 @@ void CUDART_CB myHostNodeCallback(void *type_erased_data)
152152
// NOTE(review): this region is a rendered diff. Lines marked '-' are the
// previous version, lines marked '+' the new one; the final closing brace is
// context shared by both versions.
//
// report_attempt: writes to std::cout a line saying that `attempt_kind`
// (callers in this file pass "construction" and "use") is being attempted
// for a CUDA graph whose construction method is `how_created`. The '-'
// version also printed a leading blank line and a fixed dashed rule; the '+'
// version prints only the single line.
153153void report_attempt (const char * attempt_kind, const char * how_created)
154154{
155- ::std::cout << ' \n '
156- << " Attempting " << attempt_kind << " of a CUDA graph, with construction method: " << how_created << ' \n '
157- << " ----------------------------------------------------------------\n " ;
155+ std::cout << " Attempting " << attempt_kind << " of a CUDA graph, with construction method: " << how_created << ' \n ' ;
156+ }
157+
// report_mode: writes to std::cout a blank line, a heading naming `mode`, and
// an underline made of a fixed run of dashes followed by std::strlen(mode)
// further dashes, so the rule grows with the length of `mode`.
// NOTE(review): relies on std::strlen and std::string; confirm <cstring> and
// <string> are included, and that `mode` is never null.
158+ void report_mode (const char * mode)
159+ {
160+ std::cout << ' \n '
161+ << " Graph construction method: " << mode << ' \n '
162+ << " ---------------------------" << std::string (std::strlen (mode), ' -' ) << ' \n ' ;
158163}
159164
160165void use (const cuda::device_t &device, const cuda::graph::template_t &graph, const char * how_created)
161166{
162167 report_attempt (" use" , how_created);
163- :: std::cout << " Number of graph nodes = " << graph.num_nodes () << ' \n ' ;
168+ std::cout << " Number of graph nodes = " << graph.num_nodes () << ' \n ' ;
164169
165170 auto instance = cuda::graph::instantiate (graph);
166171
@@ -169,21 +174,24 @@ void use(const cuda::device_t &device, const cuda::graph::template_t &graph, con
169174
170175 auto stream_for_graph = cuda::stream::create (device, cuda::stream::async);
171176
177+ std::ostringstream sstr;
172178 for (int i = 0 ; i < GRAPH_LAUNCH_ITERATIONS; i++) {
173- ::std::cout
174- << " Launching an instance of the original graph: launch "
175- << (i+1 ) << " of " << GRAPH_LAUNCH_ITERATIONS << ::std::endl;
179+ sstr.str (" " );
180+ sstr << " Launching an instance of the original graph: launch "
181+ << (i+1 ) << " of " << GRAPH_LAUNCH_ITERATIONS << ' \n ' ;
182+ std::cout << sstr.str () << std::flush;
176183 instance.launch (stream_for_graph);
177184 }
178185
179186 for (int i = 0 ; i < GRAPH_LAUNCH_ITERATIONS; i++) {
180- ::std::cout
181- << " Launching an instance of the cloned graph: launch "
182- << (i+1 ) << " of " << GRAPH_LAUNCH_ITERATIONS << ::std::endl;
187+ sstr.str (" " );
188+ sstr << " Launching an instance of the cloned graph: launch "
189+ << (i+1 ) << " of " << GRAPH_LAUNCH_ITERATIONS << std::endl;
190+ std::cout << sstr.str () << std::flush;
183191 cloned_graph_instance.launch (stream_for_graph);
184192 }
185- ::std::cout << ::std::endl;
186193 stream_for_graph.synchronize ();
194+ std::cout << std::endl;
187195}
188196
189197void cudaGraphsManual (
@@ -194,6 +202,7 @@ void cudaGraphsManual(
194202 span<double > result_d)
195203{
196204 const char * graph_construction_mode = " explicit node and edge insertion calls" ;
205+ report_mode (graph_construction_mode);
197206 report_attempt (" construction" , graph_construction_mode);
198207 double result_h = 0.0 ;
199208
@@ -273,7 +282,7 @@ void cudaGraphsManual(
273282
274283 graph.insert .edge (reduce_final_node, memcpy_result_node);
275284
276- auto host_function_data = :: std::make_pair (graph_construction_mode, &result_h);
285+ auto host_function_data = std::make_pair (graph_construction_mode, &result_h);
277286 auto host_function_node = graph.insert .node <node_kind_t ::host_function_call>(myHostNodeCallback, &host_function_data);
278287
279288 graph.insert .edge (memcpy_result_node, host_function_node);
@@ -291,6 +300,7 @@ void cudaGraphsManualWithBuilders(
291300 span<double > result_d)
292301{
293302 const char * graph_construction_mode = " use of node builders and explicit edge insertions" ;
303+ report_mode (graph_construction_mode);
294304 report_attempt (" construction" , graph_construction_mode);
295305 double result_h = 0.0 ;
296306
@@ -300,7 +310,7 @@ void cudaGraphsManualWithBuilders(
300310 // what about building via the graph object?
301311 cuda::graph::node::builder_t builder;
302312
303- :: std::cout << " Building a memcpy node" << :: std::endl;
313+ std::cout << " Building a memcpy node" << std::endl;
304314
305315 // TODO: Consider having builder_t::memory_copy , builder_t::memory_set etc.
306316 auto memcpy_node = builder.kind <node_kind_t ::memory_copy>()
@@ -315,14 +325,14 @@ void cudaGraphsManualWithBuilders(
315325 // .context(cuda::context::current::get())
316326 .build_within (graph);
317327
318- :: std::cout << " Building a memset node" << :: std::endl;
328+ std::cout << " Building a memset node" << std::endl;
319329
320330 auto memset_node = builder.kind <node_kind_t ::memory_set>()
321331 .region (inputVec_d)
322332 .value <float >(0 )
323333 .build_within (graph);
324334
325- :: std::cout << " Building a kernel launch node" << :: std::endl;
335+ std::cout << " Building a kernel launch node" << std::endl;
326336
327337
328338 auto wrapped_reduce_kernel = cuda::kernel::get (device, reduce);
@@ -343,15 +353,15 @@ void cudaGraphsManualWithBuilders(
343353 graph.insert .edge (memcpy_node, reduce_node);
344354 graph.insert .edge (memset_node, reduce_node);
345355
346- :: std::cout << " Building a memset node" << :: std::endl;
356+ std::cout << " Building a memset node" << std::endl;
347357
348358 auto memset_result_node = builder.kind <node_kind_t ::memory_set>()
349359 .region (result_d)
350360 .value <float >(0 )
351361 .build_within (graph);
352362
353363
354- :: std::cout << " Building a kernel launch node" << :: std::endl;
364+ std::cout << " Building a kernel launch node" << std::endl;
355365
356366 auto final_reduce_launch_config = cuda::launch_config_builder ()
357367 .grid_size (1 )
@@ -369,18 +379,18 @@ void cudaGraphsManualWithBuilders(
369379 graph.insert .edge (reduce_node, reduce_final_node);
370380 graph.insert .edge (memset_result_node, reduce_final_node);
371381
372- :: std::cout << " Building a memcpy node" << :: std::endl;
382+ std::cout << " Building a memcpy node" << std::endl;
373383
374384 auto memcpy_result_node = builder.kind <node_kind_t ::memory_copy>()
375385 .source (result_d)
376- .destination (cuda:: span<double >{&result_h,1 })
386+ .destination (span<double >{&result_h,1 })
377387 .build_within (graph);
378388
379389 graph.insert .edge (reduce_final_node, memcpy_result_node);
380390
381- auto host_function_data = :: std::make_pair (graph_construction_mode, &result_h);
391+ auto host_function_data = std::make_pair (graph_construction_mode, &result_h);
382392
383- :: std::cout << " Building a host function node" << :: std::endl;
393+ std::cout << " Building a host function node" << std::endl;
384394
385395 auto host_function_node = builder.kind <node_kind_t ::host_function_call>()
386396 .argument (&host_function_data)
@@ -400,6 +410,7 @@ void cudaGraphsUsingStreamCapture(
400410 span<double > result_d)
401411{
402412 const char * graph_construction_mode = " stream capture" ;
413+ report_mode (graph_construction_mode);
403414 report_attempt (" construction" , graph_construction_mode);
404415 double result_h = 0.0 ;
405416
@@ -455,9 +466,9 @@ void cudaGraphsUsingStreamCapture(
455466 use (device, graph, graph_construction_mode);
456467}
457468
// die_: writes `message` followed by a line break to std::cerr, then ends the
// process via exit(EXIT_FAILURE); it never returns to its caller.
// NOTE(review): the declared return type is bool even though the function is
// [[noreturn]]; presumably this lets it appear in boolean expressions —
// confirm against the call sites.
// NOTE(review): this region is a rendered diff; '-' lines are the previous
// version and '+' lines the new one (only the `::std` qualification changed).
458- [[noreturn]] bool die_ (const :: std::string& message)
469+ [[noreturn]] bool die_ (const std::string& message)
459470{
460- :: std::cerr << message << " \n " ;
471+ std::cerr << message << " \n " ;
461472 exit (EXIT_FAILURE);
462473}
463474
@@ -472,15 +483,15 @@ int main(int argc, char **argv)
472483
473484 // Being very cavalier about our command-line arguments here...
474485 cuda::device::id_t device_id = (argc > 1 ) ?
475- :: std::stoi (argv[1 ]) : cuda::device::default_device_id;
486+ std::stoi (argv[1 ]) : cuda::device::default_device_id;
476487
477488 auto device = cuda::device::get (device_id);
478489
479- :: std::cout
490+ std::cout
480491 << size << " elements\n "
481492 << " threads per block = " << THREADS_PER_BLOCK << ' \n '
482493 << " Graph Launch iterations = " << GRAPH_LAUNCH_ITERATIONS << ' \n '
483- << :: std::flush;
494+ << std::flush;
484495
485496 auto inputVec_h = cuda::memory::host::make_unique_span<float >(size);
486497 auto inputVec_d = cuda::memory::device::make_unique_span<float >(device, size);
@@ -489,23 +500,18 @@ int main(int argc, char **argv)
489500
490501 init_input (inputVec_h);
491502
492- auto result_verification = :: std::accumulate (
503+ auto result_verification = std::accumulate (
493504#if __cplusplus >= 201712L
494- :: std::execution::par_unseq,
505+ std::execution::par_unseq,
495506#endif
496507 inputVec_h.begin (), inputVec_h.end (), 0.0 );
497- :: std::cout << " Expected result = " << result_verification << ' \n ' ;
508+ std::cout << " Expected result = " << result_verification << ' \n ' ;
498509
499510 device.synchronize ();
500-
501511 cudaGraphsManual (device, inputVec_h, inputVec_d, outputVec_d, result_d);
502-
503512 cudaGraphsUsingStreamCapture (device, inputVec_h, inputVec_d, outputVec_d, result_d);
504-
505513 device.synchronize ();
506- ::std::flush (std::cout);
507-
508514 cudaGraphsManualWithBuilders (device, inputVec_h, inputVec_d, outputVec_d, result_d);
509515
510- :: std::cout << " \n \n SUCCESS\n " ;
516+ std::cout << " \n SUCCESS\n " ;
511517}
0 commit comments