@@ -19,13 +19,15 @@ long double ZePeak::_transfer_bw_gpu_copy(L0Context &context,
1919
2020 auto cmd_l = context.command_list ;
2121 auto cmd_q = context.command_queue ;
22+ auto device = context.device ;
2223
2324 if (context.copy_command_queue ) {
2425 cmd_l = context.copy_command_list ;
2526 cmd_q = context.copy_command_queue ;
2627 } else if (context.sub_device_count ) {
2728 cmd_l = context.cmd_list [current_sub_device_id];
2829 cmd_q = context.cmd_queue [current_sub_device_id];
30+ device = context.sub_devices [current_sub_device_id];
2931 }
3032
3133 SUCCESS_OR_TERMINATE (zeCommandListReset (cmd_l));
@@ -53,6 +55,87 @@ long double ZePeak::_transfer_bw_gpu_copy(L0Context &context,
5355 return calculate_gbps (timed, static_cast <long double >(buffer_size));
5456}
5557
58+ long double ZePeak::_transfer_bw_gpu_copy_with_shared_system (
59+ L0Context &context, void *destination_buffer, void *source_buffer,
60+ void *input_buffer, size_t buffer_size, bool is_source_system,
61+ MemoryAdvice advice) {
62+ Timer<std::chrono::nanoseconds::period> timer;
63+ Timer<std::chrono::nanoseconds::period> host_timer;
64+ long double gbps = 0 ;
65+ ze_result_t result = ZE_RESULT_SUCCESS;
66+
67+ auto cmd_l = context.command_list ;
68+ auto cmd_q = context.command_queue ;
69+ auto device = context.device ;
70+
71+ if (context.copy_command_queue ) {
72+ cmd_l = context.copy_command_list ;
73+ cmd_q = context.copy_command_queue ;
74+ } else if (context.sub_device_count ) {
75+ cmd_l = context.cmd_list [current_sub_device_id];
76+ cmd_q = context.cmd_queue [current_sub_device_id];
77+ device = context.sub_devices [current_sub_device_id];
78+ }
79+
80+ SUCCESS_OR_TERMINATE (zeCommandListReset (cmd_l));
81+ switch (advice) {
82+ case MemoryAdvice::SourceToSystem:
83+ SUCCESS_OR_TERMINATE (zeCommandListAppendMemAdvise (
84+ cmd_l, device, source_buffer, buffer_size,
85+ ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION));
86+ break ;
87+ case MemoryAdvice::DestinationToSystem:
88+ SUCCESS_OR_TERMINATE (zeCommandListAppendMemAdvise (
89+ cmd_l, device, destination_buffer, buffer_size,
90+ ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION));
91+ break ;
92+ }
93+ SUCCESS_OR_TERMINATE (zeCommandListAppendMemoryCopy (cmd_l, destination_buffer,
94+ source_buffer, buffer_size,
95+ nullptr , 0 , nullptr ));
96+ SUCCESS_OR_TERMINATE (zeCommandListClose (cmd_l));
97+
98+ for (uint32_t i = 0 ; i < warmup_iterations; i++) {
99+ SUCCESS_OR_TERMINATE (
100+ zeCommandQueueExecuteCommandLists (cmd_q, 1 , &cmd_l, nullptr ));
101+ SUCCESS_OR_TERMINATE (zeCommandQueueSynchronize (cmd_q, UINT64_MAX));
102+ }
103+
104+ timer.start ();
105+ for (uint32_t i = 0 ; i < iters; i++) {
106+ SUCCESS_OR_TERMINATE (
107+ zeCommandQueueExecuteCommandLists (cmd_q, 1 , &cmd_l, nullptr ));
108+ SUCCESS_OR_TERMINATE (zeCommandQueueSynchronize (cmd_q, UINT64_MAX));
109+
110+ host_timer.start ();
111+ if (is_source_system) {
112+ memset (source_buffer, 0 , buffer_size);
113+ } else {
114+ memset (destination_buffer, 0 , buffer_size);
115+ }
116+ host_timer.end ();
117+ }
118+ timer.end ();
119+ long double timed =
120+ timer.period_minus_overhead () - host_timer.period_minus_overhead ();
121+ timed /= static_cast <long double >(iters);
122+
123+ switch (advice) {
124+ case MemoryAdvice::SourceToSystem:
125+ SUCCESS_OR_TERMINATE (zeCommandListAppendMemAdvise (
126+ cmd_l, device, source_buffer, buffer_size,
127+ ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION));
128+ break ;
129+ case MemoryAdvice::DestinationToSystem:
130+ SUCCESS_OR_TERMINATE (zeCommandListAppendMemAdvise (
131+ cmd_l, device, destination_buffer, buffer_size,
132+ ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION));
133+ break ;
134+ }
135+
136+ return calculate_gbps (timed, static_cast <long double >(buffer_size));
137+ }
138+
56139long double ZePeak::_transfer_bw_host_copy (L0Context &context,
57140 void *destination_buffer,
58141 void *source_buffer,
@@ -364,8 +447,94 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
364447 std::cout << " enqueueReadBuffer : " ;
365448 std::cout << gflops << " GB/s\n " ;
366449
367- current_sub_device_id = 0 ;
450+ if ((context.device_memory_access_property .sharedSystemAllocCapabilities &
451+ ZE_MEMORY_ACCESS_CAP_FLAG_RW) != 0 ) {
452+ void *system_memory = malloc (local_memory_size);
453+ if (system_memory == nullptr ) {
454+ throw std::runtime_error (" malloc failed" );
455+ }
456+ memcpy (system_memory, local_memory, local_memory_size);
457+
458+ gflops = 0 ;
459+ if (context.sub_device_count ) {
460+ current_sub_device_id = 0 ;
461+ for (uint32_t i = 0U ; i < context.sub_device_count ; i++) {
462+ gflops += _transfer_bw_gpu_copy_with_shared_system (
463+ context, dev_out_buf[i], system_memory, local_memory,
464+ local_memory_size / context.sub_device_count , true );
465+ current_sub_device_id++;
466+ }
467+ gflops = gflops / context.sub_device_count ;
468+ } else {
469+ gflops = _transfer_bw_gpu_copy_with_shared_system (
470+ context, device_buffer, system_memory, local_memory,
471+ local_memory_size, true );
472+ }
473+ std::cout << " GPU Copy Shared System Memory to Shared Memory : " ;
474+ std::cout << gflops << " GB/s\n " ;
475+
476+ gflops = 0 ;
477+ if (context.sub_device_count ) {
478+ current_sub_device_id = 0 ;
479+ for (uint32_t i = 0U ; i < context.sub_device_count ; i++) {
480+ gflops += _transfer_bw_gpu_copy_with_shared_system (
481+ context, system_memory, dev_out_buf[i], local_memory,
482+ local_memory_size / context.sub_device_count , false );
483+ current_sub_device_id++;
484+ }
485+ gflops = gflops / context.sub_device_count ;
486+ } else {
487+ gflops = _transfer_bw_gpu_copy_with_shared_system (
488+ context, system_memory, device_buffer, local_memory,
489+ local_memory_size, false );
490+ }
491+ std::cout << " GPU Copy Shared System Memory from Shared Memory : " ;
492+ std::cout << gflops << " GB/s\n " ;
493+
494+ gflops = 0 ;
495+ if (context.sub_device_count ) {
496+ current_sub_device_id = 0 ;
497+ for (uint32_t i = 0U ; i < context.sub_device_count ; i++) {
498+ gflops += _transfer_bw_gpu_copy_with_shared_system (
499+ context, dev_out_buf[i], system_memory, local_memory,
500+ local_memory_size / context.sub_device_count , true ,
501+ MemoryAdvice::SourceToSystem);
502+ current_sub_device_id++;
503+ }
504+ gflops = gflops / context.sub_device_count ;
505+ } else {
506+ gflops = _transfer_bw_gpu_copy_with_shared_system (
507+ context, device_buffer, system_memory, local_memory,
508+ local_memory_size, true , MemoryAdvice::SourceToSystem);
509+ }
510+ std::cout << " GPU Copy Shared System Memory to Shared Memory with Memory "
511+ " Advice : " ;
512+ std::cout << gflops << " GB/s\n " ;
513+
514+ gflops = 0 ;
515+ if (context.sub_device_count ) {
516+ current_sub_device_id = 0 ;
517+ for (uint32_t i = 0U ; i < context.sub_device_count ; i++) {
518+ gflops += _transfer_bw_gpu_copy_with_shared_system (
519+ context, system_memory, dev_out_buf[i], local_memory,
520+ local_memory_size / context.sub_device_count , false ,
521+ MemoryAdvice::DestinationToSystem);
522+ current_sub_device_id++;
523+ }
524+ gflops = gflops / context.sub_device_count ;
525+ } else {
526+ gflops = _transfer_bw_gpu_copy_with_shared_system (
527+ context, system_memory, device_buffer, local_memory,
528+ local_memory_size, false , MemoryAdvice::DestinationToSystem);
529+ }
530+ std::cout << " GPU Copy Shared System Memory from Shared Memory with Memory "
531+ " Advice : " ;
532+ std::cout << gflops << " GB/s\n " ;
368533
534+ free (system_memory);
535+ }
536+
537+ current_sub_device_id = 0 ;
369538 _transfer_bw_shared_memory (context, local_memory_size, local_memory);
370539
371540 if (context.sub_device_count ) {
0 commit comments