
Commit c448f8e

simpleCudaGraphs example program: reworked some code printing to the standard output
1 parent 46dac16 commit c448f8e


examples/modified_cuda_samples/simpleCudaGraphs/simpleCudaGraphs.cu

Lines changed: 45 additions & 39 deletions
@@ -34,7 +34,7 @@
 #include <numeric>
 
 #if __cplusplus >= 202001L
-using span = ::std::span;
+using span = std::span;
 #else
 using cuda::span;
 #endif
@@ -131,14 +131,14 @@ __global__ void reduceFinal(double *inputVec, double *result, size_t inputSize)
     if (cta.thread_rank() == 0) result[0] = temp_sum;
 }
 
-void init_input(cuda::span<float> a) {
+void init_input(span<float> a) {
     auto generator = []() { return static_cast<float>(rand() & 0xFF) / static_cast<float>(RAND_MAX); };
-    ::std::generate_n(a.data(), a.size(), generator);
+    std::generate_n(a.data(), a.size(), generator);
 }
 
 void myRealHostNodeCallback(char const *graph_construction_mode, double result)
 {
-    ::std::cout << "Host callback in graph constructed by " << graph_construction_mode << ": result = " << result << ::std::endl;
+    std::cout << "Host callback in graph constructed by " << graph_construction_mode << ": result = " << result << std::endl;
     result = 0.0; // reset the result
 }
 
@@ -152,15 +152,20 @@ void CUDART_CB myHostNodeCallback(void *type_erased_data)
 
 void report_attempt(const char* attempt_kind, const char* how_created)
 {
-    ::std::cout << '\n'
-        << "Attempting " << attempt_kind << " of a CUDA graph, with construction method: " << how_created << '\n'
-        << "----------------------------------------------------------------\n";
+    std::cout << "Attempting " << attempt_kind << " of a CUDA graph, with construction method: " << how_created << '\n';
+}
+
+void report_mode(const char* mode)
+{
+    std::cout << '\n'
+        << "Graph construction method: " << mode << '\n'
+        << "---------------------------" << std::string(std::strlen(mode), '-') << '\n';
 }
 
 void use(const cuda::device_t &device, const cuda::graph::template_t &graph, const char* how_created)
 {
     report_attempt("use", how_created);
-    ::std::cout << "Number of graph nodes = " << graph.num_nodes() << '\n';
+    std::cout << "Number of graph nodes = " << graph.num_nodes() << '\n';
 
     auto instance = cuda::graph::instantiate(graph);
 
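A minimal, self-contained sketch of the banner printed by the new report_mode helper, assuming <cstring>, <iostream> and <string> are available (as they presumably already are in this translation unit):

#include <cstring>
#include <iostream>
#include <string>

// Prints a blank line, a header naming the graph construction mode, and an
// underline of '-' whose length matches the header: 27 dashes for the fixed
// "Graph construction method: " prefix plus one dash per character of `mode`.
void report_mode(const char* mode)
{
    std::cout << '\n'
        << "Graph construction method: " << mode << '\n'
        << "---------------------------" << std::string(std::strlen(mode), '-') << '\n';
}

int main()
{
    report_mode("stream capture");  // the underline spans the full header width
}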
@@ -169,21 +174,24 @@ void use(const cuda::device_t &device, const cuda::graph::template_t &graph, const char* how_created)
 
     auto stream_for_graph = cuda::stream::create(device, cuda::stream::async);
 
+    std::ostringstream sstr;
     for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) {
-        ::std::cout
-            << "Launching an instance of the original graph: launch "
-            << (i+1) << " of " << GRAPH_LAUNCH_ITERATIONS << ::std::endl;
+        sstr.str("");
+        sstr << "Launching an instance of the original graph: launch "
+            << (i+1) << " of " << GRAPH_LAUNCH_ITERATIONS << '\n';
+        std::cout << sstr.str() << std::flush;
         instance.launch(stream_for_graph);
     }
 
     for (int i = 0; i < GRAPH_LAUNCH_ITERATIONS; i++) {
-        ::std::cout
-            << "Launching an instance of the cloned graph: launch "
-            << (i+1) << " of " << GRAPH_LAUNCH_ITERATIONS << ::std::endl;
+        sstr.str("");
+        sstr << "Launching an instance of the cloned graph: launch "
+            << (i+1) << " of " << GRAPH_LAUNCH_ITERATIONS << std::endl;
+        std::cout << sstr.str() << std::flush;
         cloned_graph_instance.launch(stream_for_graph);
     }
-    ::std::cout << ::std::endl;
     stream_for_graph.synchronize();
+    std::cout << std::endl;
 }
 
 void cudaGraphsManual(
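The per-launch progress messages are now composed in a std::ostringstream and handed to std::cout as a single string followed by an explicit flush, rather than streamed piecewise. A minimal sketch of that pattern in isolation, assuming <iostream> and <sstream>; the iteration count here is a hypothetical stand-in for GRAPH_LAUNCH_ITERATIONS:

#include <iostream>
#include <sstream>

int main()
{
    constexpr int num_iterations = 3;  // hypothetical stand-in for GRAPH_LAUNCH_ITERATIONS
    std::ostringstream sstr;
    for (int i = 0; i < num_iterations; i++) {
        sstr.str("");  // reuse the buffer, discarding the previous message
        sstr << "Launching an instance of the original graph: launch "
            << (i+1) << " of " << num_iterations << '\n';
        // The whole message reaches std::cout as one insertion, then is
        // flushed, so it appears immediately and as a single unit.
        std::cout << sstr.str() << std::flush;
    }
}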
@@ -194,6 +202,7 @@ void cudaGraphsManual(
     span<double> result_d)
 {
     const char* graph_construction_mode = "explicit node and edge insertion calls";
+    report_mode(graph_construction_mode);
     report_attempt("construction", graph_construction_mode);
     double result_h = 0.0;
 
@@ -273,7 +282,7 @@ void cudaGraphsManual(
 
     graph.insert.edge(reduce_final_node, memcpy_result_node);
 
-    auto host_function_data = ::std::make_pair(graph_construction_mode, &result_h);
+    auto host_function_data = std::make_pair(graph_construction_mode, &result_h);
     auto host_function_node = graph.insert.node<node_kind_t::host_function_call>(myHostNodeCallback, &host_function_data);
 
     graph.insert.edge(memcpy_result_node, host_function_node);
@@ -291,6 +300,7 @@ void cudaGraphsManualWithBuilders(
     span<double> result_d)
 {
     const char* graph_construction_mode = "use of node builders and explicit edge insertions";
+    report_mode(graph_construction_mode);
     report_attempt("construction", graph_construction_mode);
     double result_h = 0.0;
 
@@ -300,7 +310,7 @@ void cudaGraphsManualWithBuilders(
     // what about building via the graph object?
     cuda::graph::node::builder_t builder;
 
-    ::std::cout << "Building a memcpy node" << ::std::endl;
+    std::cout << "Building a memcpy node" << std::endl;
 
     // TODO: Consider having builder_t::memory_copy , builder_t::memory_set etc.
     auto memcpy_node = builder.kind<node_kind_t::memory_copy>()
@@ -315,14 +325,14 @@
         // .context(cuda::context::current::get())
         .build_within(graph);
 
-    ::std::cout << "Building a memset node" << ::std::endl;
+    std::cout << "Building a memset node" << std::endl;
 
     auto memset_node = builder.kind<node_kind_t::memory_set>()
         .region(inputVec_d)
         .value<float>(0)
         .build_within(graph);
 
-    ::std::cout << "Building a kernel launch node" << ::std::endl;
+    std::cout << "Building a kernel launch node" << std::endl;
 
 
     auto wrapped_reduce_kernel = cuda::kernel::get(device, reduce);
@@ -343,15 +353,15 @@
     graph.insert.edge(memcpy_node, reduce_node);
     graph.insert.edge(memset_node, reduce_node);
 
-    ::std::cout << "Building a memset node" << ::std::endl;
+    std::cout << "Building a memset node" << std::endl;
 
     auto memset_result_node = builder.kind<node_kind_t::memory_set>()
         .region(result_d)
         .value<float>(0)
         .build_within(graph);
 
 
-    ::std::cout << "Building a kernel launch node" << ::std::endl;
+    std::cout << "Building a kernel launch node" << std::endl;
 
     auto final_reduce_launch_config = cuda::launch_config_builder()
         .grid_size(1)
@@ -369,18 +379,18 @@
     graph.insert.edge(reduce_node, reduce_final_node);
     graph.insert.edge(memset_result_node, reduce_final_node);
 
-    ::std::cout << "Building a memcpy node" << ::std::endl;
+    std::cout << "Building a memcpy node" << std::endl;
 
     auto memcpy_result_node = builder.kind<node_kind_t::memory_copy>()
         .source(result_d)
-        .destination(cuda::span<double>{&result_h,1})
+        .destination(span<double>{&result_h,1})
         .build_within(graph);
 
     graph.insert.edge(reduce_final_node, memcpy_result_node);
 
-    auto host_function_data = ::std::make_pair(graph_construction_mode, &result_h);
+    auto host_function_data = std::make_pair(graph_construction_mode, &result_h);
 
-    ::std::cout << "Building a host function node" << ::std::endl;
+    std::cout << "Building a host function node" << std::endl;
 
     auto host_function_node = builder.kind<node_kind_t::host_function_call>()
         .argument(&host_function_data)
@@ -400,6 +410,7 @@ void cudaGraphsUsingStreamCapture(
     span<double> result_d)
 {
     const char* graph_construction_mode = "stream capture";
+    report_mode(graph_construction_mode);
     report_attempt("construction", graph_construction_mode);
     double result_h = 0.0;
 
@@ -455,9 +466,9 @@
     use(device, graph, graph_construction_mode);
 }
 
-[[noreturn]] bool die_(const ::std::string& message)
+[[noreturn]] bool die_(const std::string& message)
 {
-    ::std::cerr << message << "\n";
+    std::cerr << message << "\n";
     exit(EXIT_FAILURE);
 }
 
@@ -472,15 +483,15 @@ int main(int argc, char **argv)
 
     // Being very cavalier about our command-line arguments here...
     cuda::device::id_t device_id = (argc > 1) ?
-        ::std::stoi(argv[1]) : cuda::device::default_device_id;
+        std::stoi(argv[1]) : cuda::device::default_device_id;
 
     auto device = cuda::device::get(device_id);
 
-    ::std::cout
+    std::cout
         << size << " elements\n"
         << "threads per block = " << THREADS_PER_BLOCK << '\n'
         << "Graph Launch iterations = " << GRAPH_LAUNCH_ITERATIONS << '\n'
-        << ::std::flush;
+        << std::flush;
 
     auto inputVec_h = cuda::memory::host::make_unique_span<float>(size);
     auto inputVec_d = cuda::memory::device::make_unique_span<float>(device, size);
@@ -489,23 +500,18 @@ int main(int argc, char **argv)
 
     init_input(inputVec_h);
 
-    auto result_verification = ::std::accumulate(
+    auto result_verification = std::accumulate(
 #if __cplusplus >= 201712L
-        ::std::execution::par_unseq,
+        std::execution::par_unseq,
 #endif
         inputVec_h.begin(), inputVec_h.end(), 0.0);
-    ::std::cout << "Expected result = " << result_verification << '\n';
+    std::cout << "Expected result = " << result_verification << '\n';
 
     device.synchronize();
-
     cudaGraphsManual(device, inputVec_h, inputVec_d, outputVec_d, result_d);
-
     cudaGraphsUsingStreamCapture(device, inputVec_h, inputVec_d, outputVec_d, result_d);
-
     device.synchronize();
-    ::std::flush(std::cout);
-
     cudaGraphsManualWithBuilders(device, inputVec_h, inputVec_d, outputVec_d, result_d);
 
-    ::std::cout << "\n\nSUCCESS\n";
+    std::cout << "\nSUCCESS\n";
 }
