@@ -53,6 +53,87 @@ long double ZePeak::_transfer_bw_gpu_copy(L0Context &context,
5353 return calculate_gbps (timed, static_cast <long double >(buffer_size));
5454}
5555
56+ long double ZePeak::_transfer_bw_gpu_copy_with_shared_system (
57+ L0Context &context, void *destination_buffer, void *source_buffer,
58+ void *input_buffer, size_t buffer_size, bool is_source_system,
59+ MemoryAdvice advice) {
60+ Timer<std::chrono::nanoseconds::period> timer;
61+ Timer<std::chrono::nanoseconds::period> host_timer;
62+ long double gbps = 0 ;
63+ ze_result_t result = ZE_RESULT_SUCCESS;
64+
65+ auto cmd_l = context.command_list ;
66+ auto cmd_q = context.command_queue ;
67+ auto device = context.device ;
68+
69+ if (context.copy_command_queue ) {
70+ cmd_l = context.copy_command_list ;
71+ cmd_q = context.copy_command_queue ;
72+ } else if (context.sub_device_count ) {
73+ cmd_l = context.cmd_list [current_sub_device_id];
74+ cmd_q = context.cmd_queue [current_sub_device_id];
75+ device = context.sub_devices [current_sub_device_id];
76+ }
77+
78+ SUCCESS_OR_TERMINATE (zeCommandListReset (cmd_l));
79+ switch (advice) {
80+ case MemoryAdvice::SourceToSystem:
81+ SUCCESS_OR_TERMINATE (zeCommandListAppendMemAdvise (
82+ cmd_l, device, source_buffer, buffer_size,
83+ ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION));
84+ break ;
85+ case MemoryAdvice::DestinationToSystem:
86+ SUCCESS_OR_TERMINATE (zeCommandListAppendMemAdvise (
87+ cmd_l, device, destination_buffer, buffer_size,
88+ ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION));
89+ break ;
90+ }
91+ SUCCESS_OR_TERMINATE (zeCommandListAppendMemoryCopy (cmd_l, destination_buffer,
92+ source_buffer, buffer_size,
93+ nullptr , 0 , nullptr ));
94+ SUCCESS_OR_TERMINATE (zeCommandListClose (cmd_l));
95+
96+ for (uint32_t i = 0 ; i < warmup_iterations; i++) {
97+ SUCCESS_OR_TERMINATE (
98+ zeCommandQueueExecuteCommandLists (cmd_q, 1 , &cmd_l, nullptr ));
99+ SUCCESS_OR_TERMINATE (zeCommandQueueSynchronize (cmd_q, UINT64_MAX));
100+ }
101+
102+ timer.start ();
103+ for (uint32_t i = 0 ; i < iters; i++) {
104+ SUCCESS_OR_TERMINATE (
105+ zeCommandQueueExecuteCommandLists (cmd_q, 1 , &cmd_l, nullptr ));
106+ SUCCESS_OR_TERMINATE (zeCommandQueueSynchronize (cmd_q, UINT64_MAX));
107+
108+ host_timer.start ();
109+ if (is_source_system) {
110+ memset (source_buffer, 0 , buffer_size);
111+ } else {
112+ memset (destination_buffer, 0 , buffer_size);
113+ }
114+ host_timer.end ();
115+ }
116+ timer.end ();
117+ long double timed =
118+ timer.period_minus_overhead () - host_timer.period_minus_overhead ();
119+ timed /= static_cast <long double >(iters);
120+
121+ switch (advice) {
122+ case MemoryAdvice::SourceToSystem:
123+ SUCCESS_OR_TERMINATE (zeCommandListAppendMemAdvise (
124+ cmd_l, device, source_buffer, buffer_size,
125+ ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION));
126+ break ;
127+ case MemoryAdvice::DestinationToSystem:
128+ SUCCESS_OR_TERMINATE (zeCommandListAppendMemAdvise (
129+ cmd_l, device, destination_buffer, buffer_size,
130+ ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION));
131+ break ;
132+ }
133+
134+ return calculate_gbps (timed, static_cast <long double >(buffer_size));
135+ }
136+
56137long double ZePeak::_transfer_bw_host_copy (L0Context &context,
57138 void *destination_buffer,
58139 void *source_buffer,
@@ -364,8 +445,94 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
364445 std::cout << " enqueueReadBuffer : " ;
365446 std::cout << gflops << " GB/s\n " ;
366447
367- current_sub_device_id = 0 ;
448+ if ((context.device_memory_access_property .sharedSystemAllocCapabilities &
449+ ZE_MEMORY_ACCESS_CAP_FLAG_RW) != 0 ) {
450+ void *system_memory = malloc (local_memory_size);
451+ if (system_memory == nullptr ) {
452+ throw std::runtime_error (" malloc failed" );
453+ }
454+ memcpy (system_memory, local_memory, local_memory_size);
455+
456+ gflops = 0 ;
457+ if (context.sub_device_count ) {
458+ current_sub_device_id = 0 ;
459+ for (uint32_t i = 0U ; i < context.sub_device_count ; i++) {
460+ gflops += _transfer_bw_gpu_copy_with_shared_system (
461+ context, dev_out_buf[i], system_memory, local_memory,
462+ local_memory_size / context.sub_device_count , true );
463+ current_sub_device_id++;
464+ }
465+ gflops = gflops / context.sub_device_count ;
466+ } else {
467+ gflops = _transfer_bw_gpu_copy_with_shared_system (
468+ context, device_buffer, system_memory, local_memory,
469+ local_memory_size, true );
470+ }
471+ std::cout << " GPU Copy Shared System Memory to Shared Memory : " ;
472+ std::cout << gflops << " GB/s\n " ;
473+
474+ gflops = 0 ;
475+ if (context.sub_device_count ) {
476+ current_sub_device_id = 0 ;
477+ for (uint32_t i = 0U ; i < context.sub_device_count ; i++) {
478+ gflops += _transfer_bw_gpu_copy_with_shared_system (
479+ context, system_memory, dev_out_buf[i], local_memory,
480+ local_memory_size / context.sub_device_count , false );
481+ current_sub_device_id++;
482+ }
483+ gflops = gflops / context.sub_device_count ;
484+ } else {
485+ gflops = _transfer_bw_gpu_copy_with_shared_system (
486+ context, system_memory, device_buffer, local_memory,
487+ local_memory_size, false );
488+ }
489+ std::cout << " GPU Copy Shared System Memory from Shared Memory : " ;
490+ std::cout << gflops << " GB/s\n " ;
491+
492+ gflops = 0 ;
493+ if (context.sub_device_count ) {
494+ current_sub_device_id = 0 ;
495+ for (uint32_t i = 0U ; i < context.sub_device_count ; i++) {
496+ gflops += _transfer_bw_gpu_copy_with_shared_system (
497+ context, dev_out_buf[i], system_memory, local_memory,
498+ local_memory_size / context.sub_device_count , true ,
499+ MemoryAdvice::SourceToSystem);
500+ current_sub_device_id++;
501+ }
502+ gflops = gflops / context.sub_device_count ;
503+ } else {
504+ gflops = _transfer_bw_gpu_copy_with_shared_system (
505+ context, device_buffer, system_memory, local_memory,
506+ local_memory_size, true , MemoryAdvice::SourceToSystem);
507+ }
508+ std::cout << " GPU Copy Shared System Memory to Shared Memory with Memory "
509+ " Advice : " ;
510+ std::cout << gflops << " GB/s\n " ;
511+
512+ gflops = 0 ;
513+ if (context.sub_device_count ) {
514+ current_sub_device_id = 0 ;
515+ for (uint32_t i = 0U ; i < context.sub_device_count ; i++) {
516+ gflops += _transfer_bw_gpu_copy_with_shared_system (
517+ context, system_memory, dev_out_buf[i], local_memory,
518+ local_memory_size / context.sub_device_count , false ,
519+ MemoryAdvice::DestinationToSystem);
520+ current_sub_device_id++;
521+ }
522+ gflops = gflops / context.sub_device_count ;
523+ } else {
524+ gflops = _transfer_bw_gpu_copy_with_shared_system (
525+ context, system_memory, device_buffer, local_memory,
526+ local_memory_size, false , MemoryAdvice::DestinationToSystem);
527+ }
528+ std::cout << " GPU Copy Shared System Memory from Shared Memory with Memory "
529+ " Advice : " ;
530+ std::cout << gflops << " GB/s\n " ;
368531
532+ free (system_memory);
533+ }
534+
535+ current_sub_device_id = 0 ;
369536 _transfer_bw_shared_memory (context, local_memory_size, local_memory);
370537
371538 if (context.sub_device_count ) {
0 commit comments