Skip to content

Commit 834a35e

Browse files
feat(ze_peak): Add Shared System Memory transfer bandwidth tests
Signed-off-by: Misiak, Konstanty <[email protected]>
1 parent a0e59be commit 834a35e

File tree

4 files changed

+182
-1
lines changed

4 files changed

+182
-1
lines changed

perf_tests/ze_peak/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ ze_peak measures the following:
1515
* Double Precision Compute in GigaFlops
1616
* Integer Compute in GigaInteger Flops
1717
* Memory Transfer Bandwidth in GigaBytes Per Second
18+
* GPU Copy Shared System Memory <-> Device Memory
1819
* GPU Copy Host <-> Shared Memory
1920
* System Memory Copy Host <-> Shared Memory
2021
* Kernel Launch Latency in microseconds

perf_tests/ze_peak/include/ze_peak.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ struct L0Context {
7575
nullptr};
7676
ze_device_compute_properties_t device_compute_property = {
7777
ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES, nullptr};
78+
ze_device_memory_access_properties_t device_memory_access_property = {
79+
ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES, nullptr};
7880
bool verbose = false;
7981
std::vector<ze_command_queue_group_properties_t> queueProperties;
8082

@@ -155,9 +157,15 @@ class ZePeak {
155157
#endif
156158

157159
private:
160+
enum class MemoryAdvice { None, SourceToSystem, DestinationToSystem };
161+
158162
long double _transfer_bw_gpu_copy(L0Context &context,
159163
void *destination_buffer,
160164
void *source_buffer, size_t buffer_size);
165+
long double _transfer_bw_gpu_copy_with_shared_system(
166+
L0Context &context, void *destination_buffer, void *source_buffer,
167+
void *input_buffer, size_t buffer_size, bool is_source_system,
168+
MemoryAdvice advice = MemoryAdvice::None);
161169
long double _transfer_bw_host_copy(L0Context &context,
162170
void *destination_buffer,
163171
void *source_buffer, size_t buffer_size,

perf_tests/ze_peak/src/transfer_bw.cpp

Lines changed: 165 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
#include "../include/ze_peak.h"
1010
#include "../../common/include/common.hpp"
1111

12+
#include <memory>
13+
1214
long double ZePeak::_transfer_bw_gpu_copy(L0Context &context,
1315
void *destination_buffer,
1416
void *source_buffer,
@@ -53,6 +55,87 @@ long double ZePeak::_transfer_bw_gpu_copy(L0Context &context,
5355
return calculate_gbps(timed, static_cast<long double>(buffer_size));
5456
}
5557

58+
/// Times a GPU-side zeCommandListAppendMemoryCopy between `source_buffer` and
/// `destination_buffer`, where one side is a plain system allocation that the
/// host re-touches (memset) after every timed copy.
///
/// @param context            Level Zero context holding the queues/lists used.
/// @param destination_buffer Copy destination.
/// @param source_buffer      Copy source.
/// @param input_buffer       Unused; kept so the declared interface is
///                           unchanged.
/// @param buffer_size        Number of bytes copied per iteration.
/// @param is_source_system   true: the host memsets `source_buffer` between
///                           iterations; false: it memsets
///                           `destination_buffer`.
/// @param advice             Which buffer (if any) receives a
///                           SET_SYSTEM_MEMORY_PREFERRED_LOCATION advice
///                           ahead of the copy.
/// @return Result of calculate_gbps() for the averaged per-iteration time.
long double ZePeak::_transfer_bw_gpu_copy_with_shared_system(
    L0Context &context, void *destination_buffer, void *source_buffer,
    void *input_buffer, size_t buffer_size, bool is_source_system,
    MemoryAdvice advice) {
  (void)input_buffer; // not needed by this measurement

  Timer<std::chrono::nanoseconds::period> timer;
  Timer<std::chrono::nanoseconds::period> host_timer;

  // Default to the main queue/list; prefer the dedicated copy queue when one
  // exists, otherwise fall back to the current sub-device's queue/list.
  auto cmd_l = context.command_list;
  auto cmd_q = context.command_queue;
  auto device = context.device;

  if (context.copy_command_queue) {
    cmd_l = context.copy_command_list;
    cmd_q = context.copy_command_queue;
  } else if (context.sub_device_count) {
    cmd_l = context.cmd_list[current_sub_device_id];
    cmd_q = context.cmd_queue[current_sub_device_id];
    device = context.sub_devices[current_sub_device_id];
  }

  // Resolve which buffer the advice applies to. Every enumerator is listed so
  // that adding a new MemoryAdvice value produces a -Wswitch diagnostic here.
  void *advised_buffer = nullptr;
  bool has_advice = false;
  switch (advice) {
  case MemoryAdvice::SourceToSystem:
    advised_buffer = source_buffer;
    has_advice = true;
    break;
  case MemoryAdvice::DestinationToSystem:
    advised_buffer = destination_buffer;
    has_advice = true;
    break;
  case MemoryAdvice::None:
    break;
  }

  // Record: [optional SET advice] + copy. The same closed list is replayed
  // for every warm-up and timed iteration.
  SUCCESS_OR_TERMINATE(zeCommandListReset(cmd_l));
  if (has_advice) {
    SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise(
        cmd_l, device, advised_buffer, buffer_size,
        ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION));
  }
  SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(cmd_l, destination_buffer,
                                                     source_buffer, buffer_size,
                                                     nullptr, 0, nullptr));
  SUCCESS_OR_TERMINATE(zeCommandListClose(cmd_l));

  for (uint32_t i = 0; i < warmup_iterations; i++) {
    SUCCESS_OR_TERMINATE(
        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));
  }

  timer.start();
  for (uint32_t i = 0; i < iters; i++) {
    SUCCESS_OR_TERMINATE(
        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));

    // Touch the system-side buffer from the host after each copy; this host
    // work is measured separately so it can be subtracted below.
    host_timer.start();
    if (is_source_system) {
      memset(source_buffer, 0, buffer_size);
    } else {
      memset(destination_buffer, 0, buffer_size);
    }
    host_timer.end();
  }
  timer.end();

  // NOTE(review): this subtracts whatever host_timer reports after the loop.
  // If Timer only retains the most recent start()/end() interval, just the
  // final memset is excluded while the outer timer contains all `iters` of
  // them - confirm against the Timer implementation.
  long double timed =
      timer.period_minus_overhead() - host_timer.period_minus_overhead();
  timed /= static_cast<long double>(iters);

  // The list was closed above, so it has to be reset before anything else can
  // be appended. The CLEAR advice is then actually submitted (the previous
  // code appended it to the closed list and never executed it).
  SUCCESS_OR_TERMINATE(zeCommandListReset(cmd_l));
  if (has_advice) {
    SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise(
        cmd_l, device, advised_buffer, buffer_size,
        ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION));
    SUCCESS_OR_TERMINATE(zeCommandListClose(cmd_l));
    SUCCESS_OR_TERMINATE(
        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));
    // Leave the list empty and open for the next user.
    SUCCESS_OR_TERMINATE(zeCommandListReset(cmd_l));
  }

  return calculate_gbps(timed, static_cast<long double>(buffer_size));
}
138+
56139
long double ZePeak::_transfer_bw_host_copy(L0Context &context,
57140
void *destination_buffer,
58141
void *source_buffer,
@@ -364,8 +447,89 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
364447
std::cout << "enqueueReadBuffer : ";
365448
std::cout << gflops << " GB/s\n";
366449

367-
current_sub_device_id = 0;
450+
if ((context.device_memory_access_property.sharedSystemAllocCapabilities &
451+
ZE_MEMORY_ACCESS_CAP_FLAG_RW) != 0) {
452+
auto system_memory = std::make_unique<uint8_t[]>(local_memory_size);
453+
memcpy(system_memory.get(), local_memory, local_memory_size);
454+
455+
gflops = 0;
456+
if (context.sub_device_count) {
457+
current_sub_device_id = 0;
458+
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
459+
gflops += _transfer_bw_gpu_copy_with_shared_system(
460+
context, dev_out_buf[i], system_memory.get(), local_memory,
461+
local_memory_size / context.sub_device_count, true);
462+
current_sub_device_id++;
463+
}
464+
gflops = gflops / context.sub_device_count;
465+
} else {
466+
gflops = _transfer_bw_gpu_copy_with_shared_system(
467+
context, device_buffer, system_memory.get(), local_memory,
468+
local_memory_size, true);
469+
}
470+
std::cout << "GPU Copy Shared System Memory to Shared Memory : ";
471+
std::cout << gflops << " GB/s\n";
472+
473+
gflops = 0;
474+
if (context.sub_device_count) {
475+
current_sub_device_id = 0;
476+
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
477+
gflops += _transfer_bw_gpu_copy_with_shared_system(
478+
context, system_memory.get(), dev_out_buf[i], local_memory,
479+
local_memory_size / context.sub_device_count, false);
480+
current_sub_device_id++;
481+
}
482+
gflops = gflops / context.sub_device_count;
483+
} else {
484+
gflops = _transfer_bw_gpu_copy_with_shared_system(
485+
context, system_memory.get(), device_buffer, local_memory,
486+
local_memory_size, false);
487+
}
488+
std::cout << "GPU Copy Shared System Memory from Shared Memory : ";
489+
std::cout << gflops << " GB/s\n";
490+
491+
gflops = 0;
492+
if (context.sub_device_count) {
493+
current_sub_device_id = 0;
494+
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
495+
gflops += _transfer_bw_gpu_copy_with_shared_system(
496+
context, dev_out_buf[i], system_memory.get(), local_memory,
497+
local_memory_size / context.sub_device_count, true,
498+
MemoryAdvice::SourceToSystem);
499+
current_sub_device_id++;
500+
}
501+
gflops = gflops / context.sub_device_count;
502+
} else {
503+
gflops = _transfer_bw_gpu_copy_with_shared_system(
504+
context, device_buffer, system_memory.get(), local_memory,
505+
local_memory_size, true, MemoryAdvice::SourceToSystem);
506+
}
507+
std::cout << "GPU Copy Shared System Memory to Shared Memory with Memory "
508+
"Advice : ";
509+
std::cout << gflops << " GB/s\n";
510+
511+
gflops = 0;
512+
if (context.sub_device_count) {
513+
current_sub_device_id = 0;
514+
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
515+
gflops += _transfer_bw_gpu_copy_with_shared_system(
516+
context, system_memory.get(), dev_out_buf[i], local_memory,
517+
local_memory_size / context.sub_device_count, false,
518+
MemoryAdvice::DestinationToSystem);
519+
current_sub_device_id++;
520+
}
521+
gflops = gflops / context.sub_device_count;
522+
} else {
523+
gflops = _transfer_bw_gpu_copy_with_shared_system(
524+
context, system_memory.get(), device_buffer, local_memory,
525+
local_memory_size, false, MemoryAdvice::DestinationToSystem);
526+
}
527+
std::cout << "GPU Copy Shared System Memory from Shared Memory with Memory "
528+
"Advice : ";
529+
std::cout << gflops << " GB/s\n";
530+
}
368531

532+
current_sub_device_id = 0;
369533
_transfer_bw_shared_memory(context, local_memory_size, local_memory);
370534

371535
if (context.sub_device_count) {

perf_tests/ze_peak/src/ze_peak.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,14 @@ void L0Context::init_xe(uint32_t specified_driver, uint32_t specified_device,
320320
if (verbose)
321321
std::cout << "Device Compute Properties retrieved\n";
322322

323+
device_memory_access_property.stype =
324+
ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES;
325+
device_memory_access_property.pNext = nullptr;
326+
SUCCESS_OR_TERMINATE(zeDeviceGetMemoryAccessProperties(
327+
device, &device_memory_access_property));
328+
if (verbose)
329+
std::cout << "Device Memory Access Properties retrieved\n";
330+
323331
zeDeviceGetSubDevices(device, &sub_device_count, nullptr);
324332
if (verbose)
325333
std::cout << "Sub Device Count retrieved\n";

0 commit comments

Comments
 (0)