Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions perf_tests/ze_peak/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ ze_peak measures the following:
* Double Precision Compute in GigaFlops
* Integer Compute in GigaInteger Flops
* Memory Transfer Bandwidth in GigaBytes Per Second
* GPU Copy Shared System Memory <-> Device Memory
* GPU Copy Host <-> Shared Memory
* System Memory Copy Host <-> Shared Memory
* Kernel Launch Latency in microseconds
Expand Down
8 changes: 8 additions & 0 deletions perf_tests/ze_peak/include/ze_peak.h
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ struct L0Context {
nullptr};
ze_device_compute_properties_t device_compute_property = {
ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES, nullptr};
// Memory access capabilities of the selected device. Filled in by
// zeDeviceGetMemoryAccessProperties in L0Context::init_xe; the transfer
// bandwidth test reads sharedSystemAllocCapabilities from it.
ze_device_memory_access_properties_t device_memory_access_property = {
ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES, nullptr};
bool verbose = false;
std::vector<ze_command_queue_group_properties_t> queueProperties;

Expand Down Expand Up @@ -155,9 +157,15 @@ class ZePeak {
#endif

private:
// Chooses which buffer, if any, receives the
// ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION advice in
// _transfer_bw_gpu_copy_with_shared_system.
enum class MemoryAdvice {
  // No memory advice is appended.
  None,
  // The advice is applied to the copy's source buffer.
  SourceToSystem,
  // The advice is applied to the copy's destination buffer.
  DestinationToSystem
};

long double _transfer_bw_gpu_copy(L0Context &context,
void *destination_buffer,
void *source_buffer, size_t buffer_size);
// Times repeated GPU copies of buffer_size bytes from source_buffer to
// destination_buffer and returns the value produced by calculate_gbps.
//   context            - supplies the command list, command queue and device.
//   destination_buffer - copy destination.
//   source_buffer      - copy source.
//   input_buffer       - not read by the implementation.
//   buffer_size        - number of bytes copied per iteration.
//   is_source_system   - true: the host memsets source_buffer between
//                        iterations; false: it memsets destination_buffer.
//   advice             - optionally marks the source or destination buffer
//                        with the system-memory preferred-location advice.
long double _transfer_bw_gpu_copy_with_shared_system(
L0Context &context, void *destination_buffer, void *source_buffer,
void *input_buffer, size_t buffer_size, bool is_source_system,
MemoryAdvice advice = MemoryAdvice::None);
long double _transfer_bw_host_copy(L0Context &context,
void *destination_buffer,
void *source_buffer, size_t buffer_size,
Expand Down
166 changes: 165 additions & 1 deletion perf_tests/ze_peak/src/transfer_bw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
#include "../include/ze_peak.h"
#include "../../common/include/common.hpp"

#include <memory>

long double ZePeak::_transfer_bw_gpu_copy(L0Context &context,
void *destination_buffer,
void *source_buffer,
Expand Down Expand Up @@ -53,6 +55,87 @@ long double ZePeak::_transfer_bw_gpu_copy(L0Context &context,
return calculate_gbps(timed, static_cast<long double>(buffer_size));
}

/// Measures GPU copy bandwidth for a transfer in which one side is a
/// shared-system (host-allocated) buffer.
///
/// @param context            Supplies the command list, queue and device. The
///                           copy-engine list/queue is preferred when present,
///                           otherwise the current sub-device's list/queue.
/// @param destination_buffer Copy destination.
/// @param source_buffer      Copy source.
/// @param input_buffer       Unused; kept so the signature stays compatible.
/// @param buffer_size        Number of bytes copied per iteration.
/// @param is_source_system   true: the host memsets source_buffer after each
///                           timed copy; false: it memsets destination_buffer.
/// @param advice             Optionally marks the source or destination buffer
///                           with the system-memory preferred-location advice
///                           for the duration of the measurement.
/// @return calculate_gbps() of the mean per-iteration copy time.
long double ZePeak::_transfer_bw_gpu_copy_with_shared_system(
    L0Context &context, void *destination_buffer, void *source_buffer,
    void *input_buffer, size_t buffer_size, bool is_source_system,
    MemoryAdvice advice) {
  (void)input_buffer;

  Timer<std::chrono::nanoseconds::period> timer;

  auto cmd_l = context.command_list;
  auto cmd_q = context.command_queue;
  auto device = context.device;

  if (context.copy_command_queue) {
    cmd_l = context.copy_command_list;
    cmd_q = context.copy_command_queue;
  } else if (context.sub_device_count) {
    cmd_l = context.cmd_list[current_sub_device_id];
    cmd_q = context.cmd_queue[current_sub_device_id];
    device = context.sub_devices[current_sub_device_id];
  }

  // Resolve the buffer the advice applies to once, so the set and clear
  // paths below cannot disagree.
  void *advised_buffer = nullptr;
  switch (advice) {
  case MemoryAdvice::SourceToSystem:
    advised_buffer = source_buffer;
    break;
  case MemoryAdvice::DestinationToSystem:
    advised_buffer = destination_buffer;
    break;
  case MemoryAdvice::None:
    break;
  }
  const bool has_advice = (advice != MemoryAdvice::None);

  // The buffer the host writes between iterations.
  void *const host_touched_buffer =
      is_source_system ? source_buffer : destination_buffer;

  SUCCESS_OR_TERMINATE(zeCommandListReset(cmd_l));
  if (has_advice) {
    SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise(
        cmd_l, device, advised_buffer, buffer_size,
        ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION));
  }
  SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmd_l, destination_buffer,
                                                     source_buffer, buffer_size,
                                                     nullptr, 0, nullptr));
  SUCCESS_OR_TERMINATE(zeCommandListClose(cmd_l));

  for (uint32_t i = 0; i < warmup_iterations; i++) {
    SUCCESS_OR_TERMINATE(
        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));
  }

  // Only the submit + synchronize section of each iteration is timed and
  // accumulated; the host memset that follows stays outside the timed region
  // for every iteration.
  // NOTE(review): assumes Timer::start()/end() record a single interval
  // (each start() discards the previous one) -- confirm against common.hpp.
  long double timed = 0;
  for (uint32_t i = 0; i < iters; i++) {
    timer.start();
    SUCCESS_OR_TERMINATE(
        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));
    timer.end();
    timed += timer.period_minus_overhead();

    memset(host_touched_buffer, 0, buffer_size);
  }
  timed /= static_cast<long double>(iters);

  // Undo the preferred-location advice. The command list was closed above, so
  // it has to be reset before anything new is appended, and the clear only
  // takes effect once the list is actually executed.
  if (has_advice) {
    SUCCESS_OR_TERMINATE(zeCommandListReset(cmd_l));
    SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise(
        cmd_l, device, advised_buffer, buffer_size,
        ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION));
    SUCCESS_OR_TERMINATE(zeCommandListClose(cmd_l));
    SUCCESS_OR_TERMINATE(
        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));
  }

  return calculate_gbps(timed, static_cast<long double>(buffer_size));
}

long double ZePeak::_transfer_bw_host_copy(L0Context &context,
void *destination_buffer,
void *source_buffer,
Expand Down Expand Up @@ -364,8 +447,89 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
std::cout << "enqueueReadBuffer : ";
std::cout << gflops << " GB/s\n";

current_sub_device_id = 0;
// Shared-system-memory measurements: run only when the device reports
// read/write access capability for shared system allocations.
if ((context.device_memory_access_property.sharedSystemAllocCapabilities &
ZE_MEMORY_ACCESS_CAP_FLAG_RW) != 0) {
// Host allocation standing in for "shared system" memory, seeded with the
// contents of local_memory. Released automatically at the end of this scope.
auto system_memory = std::make_unique<uint8_t[]>(local_memory_size);
memcpy(system_memory.get(), local_memory, local_memory_size);

// Case 1: system memory is the copy source (is_source_system = true),
// no memory advice.
gflops = 0;
if (context.sub_device_count) {
// Each sub-device copies an equal share of the buffer; the reported
// figure is the average over the sub-devices.
current_sub_device_id = 0;
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
gflops += _transfer_bw_gpu_copy_with_shared_system(
context, dev_out_buf[i], system_memory.get(), local_memory,
local_memory_size / context.sub_device_count, true);
current_sub_device_id++;
}
gflops = gflops / context.sub_device_count;
} else {
gflops = _transfer_bw_gpu_copy_with_shared_system(
context, device_buffer, system_memory.get(), local_memory,
local_memory_size, true);
}
std::cout << "GPU Copy Shared System Memory to Shared Memory : ";
std::cout << gflops << " GB/s\n";

// Case 2: system memory is the copy destination (is_source_system = false),
// no memory advice.
gflops = 0;
if (context.sub_device_count) {
current_sub_device_id = 0;
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
gflops += _transfer_bw_gpu_copy_with_shared_system(
context, system_memory.get(), dev_out_buf[i], local_memory,
local_memory_size / context.sub_device_count, false);
current_sub_device_id++;
}
gflops = gflops / context.sub_device_count;
} else {
gflops = _transfer_bw_gpu_copy_with_shared_system(
context, system_memory.get(), device_buffer, local_memory,
local_memory_size, false);
}
std::cout << "GPU Copy Shared System Memory from Shared Memory : ";
std::cout << gflops << " GB/s\n";

// Case 3: same as case 1, with the preferred-location advice applied to the
// source (system memory) buffer.
gflops = 0;
if (context.sub_device_count) {
current_sub_device_id = 0;
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
gflops += _transfer_bw_gpu_copy_with_shared_system(
context, dev_out_buf[i], system_memory.get(), local_memory,
local_memory_size / context.sub_device_count, true,
MemoryAdvice::SourceToSystem);
current_sub_device_id++;
}
gflops = gflops / context.sub_device_count;
} else {
gflops = _transfer_bw_gpu_copy_with_shared_system(
context, device_buffer, system_memory.get(), local_memory,
local_memory_size, true, MemoryAdvice::SourceToSystem);
}
std::cout << "GPU Copy Shared System Memory to Shared Memory with Memory "
"Advice : ";
std::cout << gflops << " GB/s\n";

// Case 4: same as case 2, with the preferred-location advice applied to the
// destination (system memory) buffer.
gflops = 0;
if (context.sub_device_count) {
current_sub_device_id = 0;
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
gflops += _transfer_bw_gpu_copy_with_shared_system(
context, system_memory.get(), dev_out_buf[i], local_memory,
local_memory_size / context.sub_device_count, false,
MemoryAdvice::DestinationToSystem);
current_sub_device_id++;
}
gflops = gflops / context.sub_device_count;
} else {
gflops = _transfer_bw_gpu_copy_with_shared_system(
context, system_memory.get(), device_buffer, local_memory,
local_memory_size, false, MemoryAdvice::DestinationToSystem);
}
std::cout << "GPU Copy Shared System Memory from Shared Memory with Memory "
"Advice : ";
std::cout << gflops << " GB/s\n";
}

current_sub_device_id = 0;
_transfer_bw_shared_memory(context, local_memory_size, local_memory);

if (context.sub_device_count) {
Expand Down
8 changes: 8 additions & 0 deletions perf_tests/ze_peak/src/ze_peak.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,14 @@ void L0Context::init_xe(uint32_t specified_driver, uint32_t specified_device,
if (verbose)
std::cout << "Device Compute Properties retrieved\n";

// Query the device's memory access capabilities. stype and pNext are set
// explicitly before the call; failure of the query terminates via
// SUCCESS_OR_TERMINATE.
device_memory_access_property.stype =
ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES;
device_memory_access_property.pNext = nullptr;
SUCCESS_OR_TERMINATE(zeDeviceGetMemoryAccessProperties(
device, &device_memory_access_property));
if (verbose)
std::cout << "Device Memory Access Properties retrieved\n";

zeDeviceGetSubDevices(device, &sub_device_count, nullptr);
if (verbose)
std::cout << "Sub Device Count retrieved\n";
Expand Down
Loading