|
9 | 9 | #include "../include/ze_peak.h" |
10 | 10 | #include "../../common/include/common.hpp" |
11 | 11 |
|
| 12 | +#include <memory> |
| 13 | + |
12 | 14 | long double ZePeak::_transfer_bw_gpu_copy(L0Context &context, |
13 | 15 | void *destination_buffer, |
14 | 16 | void *source_buffer, |
@@ -53,6 +55,87 @@ long double ZePeak::_transfer_bw_gpu_copy(L0Context &context, |
53 | 55 | return calculate_gbps(timed, static_cast<long double>(buffer_size)); |
54 | 56 | } |
55 | 57 |
|
| 58 | +long double ZePeak::_transfer_bw_gpu_copy_with_shared_system( |
| 59 | + L0Context &context, void *destination_buffer, void *source_buffer, |
| 60 | + void *input_buffer, size_t buffer_size, bool is_source_system, |
| 61 | + MemoryAdvice advice) { |
| 62 | + Timer<std::chrono::nanoseconds::period> timer; |
| 63 | + Timer<std::chrono::nanoseconds::period> host_timer; |
| 64 | + long double gbps = 0; |
| 65 | + ze_result_t result = ZE_RESULT_SUCCESS; |
| 66 | + |
| 67 | + auto cmd_l = context.command_list; |
| 68 | + auto cmd_q = context.command_queue; |
| 69 | + auto device = context.device; |
| 70 | + |
| 71 | + if (context.copy_command_queue) { |
| 72 | + cmd_l = context.copy_command_list; |
| 73 | + cmd_q = context.copy_command_queue; |
| 74 | + } else if (context.sub_device_count) { |
| 75 | + cmd_l = context.cmd_list[current_sub_device_id]; |
| 76 | + cmd_q = context.cmd_queue[current_sub_device_id]; |
| 77 | + device = context.sub_devices[current_sub_device_id]; |
| 78 | + } |
| 79 | + |
| 80 | + SUCCESS_OR_TERMINATE(zeCommandListReset(cmd_l)); |
| 81 | + switch (advice) { |
| 82 | + case MemoryAdvice::SourceToSystem: |
| 83 | + SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise( |
| 84 | + cmd_l, device, source_buffer, buffer_size, |
| 85 | + ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION)); |
| 86 | + break; |
| 87 | + case MemoryAdvice::DestinationToSystem: |
| 88 | + SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise( |
| 89 | + cmd_l, device, destination_buffer, buffer_size, |
| 90 | + ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION)); |
| 91 | + break; |
| 92 | + } |
| 93 | + SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmd_l, destination_buffer, |
| 94 | + source_buffer, buffer_size, |
| 95 | + nullptr, 0, nullptr)); |
| 96 | + SUCCESS_OR_TERMINATE(zeCommandListClose(cmd_l)); |
| 97 | + |
| 98 | + for (uint32_t i = 0; i < warmup_iterations; i++) { |
| 99 | + SUCCESS_OR_TERMINATE( |
| 100 | + zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr)); |
| 101 | + SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX)); |
| 102 | + } |
| 103 | + |
| 104 | + timer.start(); |
| 105 | + for (uint32_t i = 0; i < iters; i++) { |
| 106 | + SUCCESS_OR_TERMINATE( |
| 107 | + zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr)); |
| 108 | + SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX)); |
| 109 | + |
| 110 | + host_timer.start(); |
| 111 | + if (is_source_system) { |
| 112 | + memset(source_buffer, 0, buffer_size); |
| 113 | + } else { |
| 114 | + memset(destination_buffer, 0, buffer_size); |
| 115 | + } |
| 116 | + host_timer.end(); |
| 117 | + } |
| 118 | + timer.end(); |
| 119 | + long double timed = |
| 120 | + timer.period_minus_overhead() - host_timer.period_minus_overhead(); |
| 121 | + timed /= static_cast<long double>(iters); |
| 122 | + |
| 123 | + switch (advice) { |
| 124 | + case MemoryAdvice::SourceToSystem: |
| 125 | + SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise( |
| 126 | + cmd_l, device, source_buffer, buffer_size, |
| 127 | + ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION)); |
| 128 | + break; |
| 129 | + case MemoryAdvice::DestinationToSystem: |
| 130 | + SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise( |
| 131 | + cmd_l, device, destination_buffer, buffer_size, |
| 132 | + ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION)); |
| 133 | + break; |
| 134 | + } |
| 135 | + |
| 136 | + return calculate_gbps(timed, static_cast<long double>(buffer_size)); |
| 137 | +} |
| 138 | + |
56 | 139 | long double ZePeak::_transfer_bw_host_copy(L0Context &context, |
57 | 140 | void *destination_buffer, |
58 | 141 | void *source_buffer, |
@@ -364,8 +447,89 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) { |
364 | 447 | std::cout << "enqueueReadBuffer : "; |
365 | 448 | std::cout << gflops << " GB/s\n"; |
366 | 449 |
|
367 | | - current_sub_device_id = 0; |
| 450 | + if ((context.device_memory_access_property.sharedSystemAllocCapabilities & |
| 451 | + ZE_MEMORY_ACCESS_CAP_FLAG_RW) != 0) { |
| 452 | + auto system_memory = std::make_unique<uint8_t[]>(local_memory_size); |
| 453 | + memcpy(system_memory.get(), local_memory, local_memory_size); |
| 454 | + |
| 455 | + gflops = 0; |
| 456 | + if (context.sub_device_count) { |
| 457 | + current_sub_device_id = 0; |
| 458 | + for (uint32_t i = 0U; i < context.sub_device_count; i++) { |
| 459 | + gflops += _transfer_bw_gpu_copy_with_shared_system( |
| 460 | + context, dev_out_buf[i], system_memory.get(), local_memory, |
| 461 | + local_memory_size / context.sub_device_count, true); |
| 462 | + current_sub_device_id++; |
| 463 | + } |
| 464 | + gflops = gflops / context.sub_device_count; |
| 465 | + } else { |
| 466 | + gflops = _transfer_bw_gpu_copy_with_shared_system( |
| 467 | + context, device_buffer, system_memory.get(), local_memory, |
| 468 | + local_memory_size, true); |
| 469 | + } |
| 470 | + std::cout << "GPU Copy Shared System Memory to Shared Memory : "; |
| 471 | + std::cout << gflops << " GB/s\n"; |
| 472 | + |
| 473 | + gflops = 0; |
| 474 | + if (context.sub_device_count) { |
| 475 | + current_sub_device_id = 0; |
| 476 | + for (uint32_t i = 0U; i < context.sub_device_count; i++) { |
| 477 | + gflops += _transfer_bw_gpu_copy_with_shared_system( |
| 478 | + context, system_memory.get(), dev_out_buf[i], local_memory, |
| 479 | + local_memory_size / context.sub_device_count, false); |
| 480 | + current_sub_device_id++; |
| 481 | + } |
| 482 | + gflops = gflops / context.sub_device_count; |
| 483 | + } else { |
| 484 | + gflops = _transfer_bw_gpu_copy_with_shared_system( |
| 485 | + context, system_memory.get(), device_buffer, local_memory, |
| 486 | + local_memory_size, false); |
| 487 | + } |
| 488 | + std::cout << "GPU Copy Shared System Memory from Shared Memory : "; |
| 489 | + std::cout << gflops << " GB/s\n"; |
| 490 | + |
| 491 | + gflops = 0; |
| 492 | + if (context.sub_device_count) { |
| 493 | + current_sub_device_id = 0; |
| 494 | + for (uint32_t i = 0U; i < context.sub_device_count; i++) { |
| 495 | + gflops += _transfer_bw_gpu_copy_with_shared_system( |
| 496 | + context, dev_out_buf[i], system_memory.get(), local_memory, |
| 497 | + local_memory_size / context.sub_device_count, true, |
| 498 | + MemoryAdvice::SourceToSystem); |
| 499 | + current_sub_device_id++; |
| 500 | + } |
| 501 | + gflops = gflops / context.sub_device_count; |
| 502 | + } else { |
| 503 | + gflops = _transfer_bw_gpu_copy_with_shared_system( |
| 504 | + context, device_buffer, system_memory.get(), local_memory, |
| 505 | + local_memory_size, true, MemoryAdvice::SourceToSystem); |
| 506 | + } |
| 507 | + std::cout << "GPU Copy Shared System Memory to Shared Memory with Memory " |
| 508 | + "Advice : "; |
| 509 | + std::cout << gflops << " GB/s\n"; |
| 510 | + |
| 511 | + gflops = 0; |
| 512 | + if (context.sub_device_count) { |
| 513 | + current_sub_device_id = 0; |
| 514 | + for (uint32_t i = 0U; i < context.sub_device_count; i++) { |
| 515 | + gflops += _transfer_bw_gpu_copy_with_shared_system( |
| 516 | + context, system_memory.get(), dev_out_buf[i], local_memory, |
| 517 | + local_memory_size / context.sub_device_count, false, |
| 518 | + MemoryAdvice::DestinationToSystem); |
| 519 | + current_sub_device_id++; |
| 520 | + } |
| 521 | + gflops = gflops / context.sub_device_count; |
| 522 | + } else { |
| 523 | + gflops = _transfer_bw_gpu_copy_with_shared_system( |
| 524 | + context, system_memory.get(), device_buffer, local_memory, |
| 525 | + local_memory_size, false, MemoryAdvice::DestinationToSystem); |
| 526 | + } |
| 527 | + std::cout << "GPU Copy Shared System Memory from Shared Memory with Memory " |
| 528 | + "Advice : "; |
| 529 | + std::cout << gflops << " GB/s\n"; |
| 530 | + } |
368 | 531 |
|
| 532 | + current_sub_device_id = 0; |
369 | 533 | _transfer_bw_shared_memory(context, local_memory_size, local_memory); |
370 | 534 |
|
371 | 535 | if (context.sub_device_count) { |
|
0 commit comments