diff --git a/perf_tests/ze_peak/README.md b/perf_tests/ze_peak/README.md
index 99aafb5f..c66f5cc8 100644
--- a/perf_tests/ze_peak/README.md
+++ b/perf_tests/ze_peak/README.md
@@ -15,6 +15,7 @@ ze_peak measures the following:
  * Double Precision Compute in GigaFlops
  * Integer Compute in GigaInteger Flops
  * Memory Transfer Bandwidth in GigaBytes Per Second
+ * GPU Copy Shared System Memory <-> Device Memory
  * GPU Copy Host <-> Shared Memory
  * System Memory Copy Host <-> Shared Memory
  * Kernel Launch Latency in micro seconds
diff --git a/perf_tests/ze_peak/include/ze_peak.h b/perf_tests/ze_peak/include/ze_peak.h
index 6d48e17b..b6e2de7b 100644
--- a/perf_tests/ze_peak/include/ze_peak.h
+++ b/perf_tests/ze_peak/include/ze_peak.h
@@ -75,6 +75,8 @@ struct L0Context {
                                             nullptr};
   ze_device_compute_properties_t device_compute_property = {
       ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES, nullptr};
+  ze_device_memory_access_properties_t device_memory_access_property = {
+      ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES, nullptr};
   bool verbose = false;
   std::vector<ze_command_queue_group_properties_t> queueProperties;
 
@@ -155,9 +157,15 @@ class ZePeak {
 #endif
 
 private:
+  enum class MemoryAdvice { None, SourceToSystem, DestinationToSystem };
+
   long double _transfer_bw_gpu_copy(L0Context &context,
                                     void *destination_buffer,
                                     void *source_buffer, size_t buffer_size);
+  long double _transfer_bw_gpu_copy_with_shared_system(
+      L0Context &context, void *destination_buffer, void *source_buffer,
+      void *input_buffer, size_t buffer_size, bool is_source_system,
+      MemoryAdvice advice = MemoryAdvice::None);
   long double _transfer_bw_host_copy(L0Context &context,
                                      void *destination_buffer,
                                      void *source_buffer, size_t buffer_size,
diff --git a/perf_tests/ze_peak/src/transfer_bw.cpp b/perf_tests/ze_peak/src/transfer_bw.cpp
index 797f5cc6..697950b4 100644
--- a/perf_tests/ze_peak/src/transfer_bw.cpp
+++ b/perf_tests/ze_peak/src/transfer_bw.cpp
@@ -9,6 +9,8 @@
 #include "../include/ze_peak.h"
 #include "../../common/include/common.hpp"
 
+#include <memory>
+
 long double ZePeak::_transfer_bw_gpu_copy(L0Context &context,
                                           void *destination_buffer,
                                           void *source_buffer,
@@ -53,6 +55,87 @@ long double ZePeak::_transfer_bw_gpu_copy(L0Context &context,
   return calculate_gbps(timed, static_cast<long double>(buffer_size));
 }
 
+long double ZePeak::_transfer_bw_gpu_copy_with_shared_system(
+    L0Context &context, void *destination_buffer, void *source_buffer,
+    void *input_buffer, size_t buffer_size, bool is_source_system,
+    MemoryAdvice advice) {
+  Timer timer;
+  Timer host_timer;
+  long double gbps = 0;
+  ze_result_t result = ZE_RESULT_SUCCESS;
+
+  auto cmd_l = context.command_list;
+  auto cmd_q = context.command_queue;
+  auto device = context.device;
+
+  if (context.copy_command_queue) {
+    cmd_l = context.copy_command_list;
+    cmd_q = context.copy_command_queue;
+  } else if (context.sub_device_count) {
+    cmd_l = context.cmd_list[current_sub_device_id];
+    cmd_q = context.cmd_queue[current_sub_device_id];
+    device = context.sub_devices[current_sub_device_id];
+  }
+
+  SUCCESS_OR_TERMINATE(zeCommandListReset(cmd_l));
+  switch (advice) {
+  case MemoryAdvice::SourceToSystem:
+    SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise(
+        cmd_l, device, source_buffer, buffer_size,
+        ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION));
+    break;
+  case MemoryAdvice::DestinationToSystem:
+    SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise(
+        cmd_l, device, destination_buffer, buffer_size,
+        ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION));
+    break;
+  }
+  SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmd_l, destination_buffer,
+                                                     source_buffer, buffer_size,
+                                                     nullptr, 0, nullptr));
+  SUCCESS_OR_TERMINATE(zeCommandListClose(cmd_l));
+
+  for (uint32_t i = 0; i < warmup_iterations; i++) {
+    SUCCESS_OR_TERMINATE(
+        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
+    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));
+  }
+
+  timer.start();
+  for (uint32_t i = 0; i < iters; i++) {
+    SUCCESS_OR_TERMINATE(
+        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
+    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));
+
+    host_timer.start();
+    if (is_source_system) {
+      memset(source_buffer, 0, buffer_size);
+    } else {
+      memset(destination_buffer, 0, buffer_size);
+    }
+    host_timer.end();
+  }
+  timer.end();
+  long double timed =
+      timer.period_minus_overhead() - host_timer.period_minus_overhead();
+  timed /= static_cast<long double>(iters);
+
+  switch (advice) {
+  case MemoryAdvice::SourceToSystem:
+    SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise(
+        cmd_l, device, source_buffer, buffer_size,
+        ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION));
+    break;
+  case MemoryAdvice::DestinationToSystem:
+    SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise(
+        cmd_l, device, destination_buffer, buffer_size,
+        ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION));
+    break;
+  }
+
+  return calculate_gbps(timed, static_cast<long double>(buffer_size));
+}
+
 long double ZePeak::_transfer_bw_host_copy(L0Context &context,
                                            void *destination_buffer,
                                            void *source_buffer,
@@ -364,8 +447,89 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
   std::cout << "enqueueReadBuffer : ";
   std::cout << gflops << " GB/s\n";
 
-  current_sub_device_id = 0;
+  if ((context.device_memory_access_property.sharedSystemAllocCapabilities &
+       ZE_MEMORY_ACCESS_CAP_FLAG_RW) != 0) {
+    auto system_memory = std::make_unique<uint8_t[]>(local_memory_size);
+    memcpy(system_memory.get(), local_memory, local_memory_size);
+
+    gflops = 0;
+    if (context.sub_device_count) {
+      current_sub_device_id = 0;
+      for (uint32_t i = 0U; i < context.sub_device_count; i++) {
+        gflops += _transfer_bw_gpu_copy_with_shared_system(
+            context, dev_out_buf[i], system_memory.get(), local_memory,
+            local_memory_size / context.sub_device_count, true);
+        current_sub_device_id++;
+      }
+      gflops = gflops / context.sub_device_count;
+    } else {
+      gflops = _transfer_bw_gpu_copy_with_shared_system(
+          context, device_buffer, system_memory.get(), local_memory,
+          local_memory_size, true);
+    }
+    std::cout << "GPU Copy Shared System Memory to Shared Memory : ";
+    std::cout << gflops << " GB/s\n";
+
+    gflops = 0;
+    if (context.sub_device_count) {
+      current_sub_device_id = 0;
+      for (uint32_t i = 0U; i < context.sub_device_count; i++) {
+        gflops += _transfer_bw_gpu_copy_with_shared_system(
+            context, system_memory.get(), dev_out_buf[i], local_memory,
+            local_memory_size / context.sub_device_count, false);
+        current_sub_device_id++;
+      }
+      gflops = gflops / context.sub_device_count;
+    } else {
+      gflops = _transfer_bw_gpu_copy_with_shared_system(
+          context, system_memory.get(), device_buffer, local_memory,
+          local_memory_size, false);
+    }
+    std::cout << "GPU Copy Shared System Memory from Shared Memory : ";
+    std::cout << gflops << " GB/s\n";
+
+    gflops = 0;
+    if (context.sub_device_count) {
+      current_sub_device_id = 0;
+      for (uint32_t i = 0U; i < context.sub_device_count; i++) {
+        gflops += _transfer_bw_gpu_copy_with_shared_system(
+            context, dev_out_buf[i], system_memory.get(), local_memory,
+            local_memory_size / context.sub_device_count, true,
+            MemoryAdvice::SourceToSystem);
+        current_sub_device_id++;
+      }
+      gflops = gflops / context.sub_device_count;
+    } else {
+      gflops = _transfer_bw_gpu_copy_with_shared_system(
+          context, device_buffer, system_memory.get(), local_memory,
+          local_memory_size, true, MemoryAdvice::SourceToSystem);
+    }
+    std::cout << "GPU Copy Shared System Memory to Shared Memory with Memory "
+                 "Advice : ";
+    std::cout << gflops << " GB/s\n";
+
+    gflops = 0;
+    if (context.sub_device_count) {
+      current_sub_device_id = 0;
+      for (uint32_t i = 0U; i < context.sub_device_count; i++) {
+        gflops += _transfer_bw_gpu_copy_with_shared_system(
+            context, system_memory.get(), dev_out_buf[i], local_memory,
+            local_memory_size / context.sub_device_count, false,
+            MemoryAdvice::DestinationToSystem);
+        current_sub_device_id++;
+      }
+      gflops = gflops / context.sub_device_count;
+    } else {
+      gflops = _transfer_bw_gpu_copy_with_shared_system(
+          context, system_memory.get(), device_buffer, local_memory,
+          local_memory_size, false, MemoryAdvice::DestinationToSystem);
+    }
+    std::cout << "GPU Copy Shared System Memory from Shared Memory with Memory "
+                 "Advice : ";
+    std::cout << gflops << " GB/s\n";
+  }
+  current_sub_device_id = 0;
   _transfer_bw_shared_memory(context, local_memory_size, local_memory);
 
   if (context.sub_device_count) {
diff --git a/perf_tests/ze_peak/src/ze_peak.cpp b/perf_tests/ze_peak/src/ze_peak.cpp
index acc8f7f3..82727d4e 100644
--- a/perf_tests/ze_peak/src/ze_peak.cpp
+++ b/perf_tests/ze_peak/src/ze_peak.cpp
@@ -320,6 +320,14 @@ void L0Context::init_xe(uint32_t specified_driver, uint32_t specified_device,
   if (verbose)
     std::cout << "Device Compute Properties retrieved\n";
 
+  device_memory_access_property.stype =
+      ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES;
+  device_memory_access_property.pNext = nullptr;
+  SUCCESS_OR_TERMINATE(zeDeviceGetMemoryAccessProperties(
+      device, &device_memory_access_property));
+  if (verbose)
+    std::cout << "Device Memory Access Properties retrieved\n";
+
   zeDeviceGetSubDevices(device, &sub_device_count, nullptr);
   if (verbose)
     std::cout << "Sub Device Count retrieved\n";