Skip to content

Commit c6e3fbc

Browse files
feat(ze_peak): Add Shared System Memory transfer bandwidth tests
Signed-off-by: Misiak, Konstanty <[email protected]>
1 parent f42c89e commit c6e3fbc

File tree

4 files changed

+191
-1
lines changed

4 files changed

+191
-1
lines changed

perf_tests/ze_peak/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ ze_peak measures the following:
1515
* Double Precision Compute in GigaFlops
1616
* Integer Compute in GigaInteger Flops
1717
* Memory Transfer Bandwidth in GigaBytes Per Second
18+
* GPU Copy Shared System Memory <-> Device Memory
1819
* GPU Copy Host <-> Shared Memory
1920
* System Memory Copy Host <-> Shared Memory
2021
* Kernel Launch Latency in micro seconds

perf_tests/ze_peak/include/ze_peak.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ struct L0Context {
7575
nullptr};
7676
ze_device_compute_properties_t device_compute_property = {
7777
ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES, nullptr};
78+
ze_device_memory_access_properties_t device_memory_access_property = {
79+
ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES, nullptr};
7880
bool verbose = false;
7981
std::vector<ze_command_queue_group_properties_t> queueProperties;
8082

@@ -155,9 +157,15 @@ class ZePeak {
155157
#endif
156158

157159
private:
160+
enum class MemoryAdvice { None, SourceToSystem, DestinationToSystem };
161+
158162
long double _transfer_bw_gpu_copy(L0Context &context,
159163
void *destination_buffer,
160164
void *source_buffer, size_t buffer_size);
165+
long double _transfer_bw_gpu_copy_with_shared_system(
166+
L0Context &context, void *destination_buffer, void *source_buffer,
167+
void *input_buffer, size_t buffer_size, bool is_source_system,
168+
MemoryAdvice advice = MemoryAdvice::None);
161169
long double _transfer_bw_host_copy(L0Context &context,
162170
void *destination_buffer,
163171
void *source_buffer, size_t buffer_size,

perf_tests/ze_peak/src/transfer_bw.cpp

Lines changed: 170 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,15 @@ long double ZePeak::_transfer_bw_gpu_copy(L0Context &context,
1919

2020
auto cmd_l = context.command_list;
2121
auto cmd_q = context.command_queue;
22+
auto device = context.device;
2223

2324
if (context.copy_command_queue) {
2425
cmd_l = context.copy_command_list;
2526
cmd_q = context.copy_command_queue;
2627
} else if (context.sub_device_count) {
2728
cmd_l = context.cmd_list[current_sub_device_id];
2829
cmd_q = context.cmd_queue[current_sub_device_id];
30+
device = context.sub_devices[current_sub_device_id];
2931
}
3032

3133
SUCCESS_OR_TERMINATE(zeCommandListReset(cmd_l));
@@ -53,6 +55,87 @@ long double ZePeak::_transfer_bw_gpu_copy(L0Context &context,
5355
return calculate_gbps(timed, static_cast<long double>(buffer_size));
5456
}
5557

58+
/// @brief Measures GPU copy bandwidth when one side of the copy is a
///        malloc'ed (shared system) allocation.
///
/// Records one zeCommandListAppendMemoryCopy of `buffer_size` bytes from
/// `source_buffer` to `destination_buffer`, optionally preceded by a
/// "system memory preferred location" advice on one of the two buffers,
/// runs it `warmup_iterations` times untimed and then `iters` times timed.
/// After every timed copy the system-side buffer is overwritten from the
/// host; the time spent in those host writes is excluded from the result.
///
/// @param context            Supplies the command list / queue / device. The
///                           copy queue is preferred when present, otherwise
///                           the queue of `current_sub_device_id` when
///                           sub-devices exist, otherwise the default queue.
/// @param destination_buffer Copy destination.
/// @param source_buffer      Copy source.
/// @param input_buffer       Unused; kept so the signature stays unchanged.
/// @param buffer_size        Number of bytes copied per iteration.
/// @param is_source_system   true: `source_buffer` is the buffer rewritten by
///                           the host between iterations; false:
///                           `destination_buffer` is.
/// @param advice             Which buffer (if any) receives the preferred
///                           location advice for the duration of the run.
/// @return Result of calculate_gbps() for the average per-iteration time.
long double ZePeak::_transfer_bw_gpu_copy_with_shared_system(
    L0Context &context, void *destination_buffer, void *source_buffer,
    void *input_buffer, size_t buffer_size, bool is_source_system,
    MemoryAdvice advice) {
  (void)input_buffer; // not needed by this measurement

  Timer<std::chrono::nanoseconds::period> timer;
  Timer<std::chrono::nanoseconds::period> host_timer;

  auto cmd_l = context.command_list;
  auto cmd_q = context.command_queue;
  auto device = context.device;

  if (context.copy_command_queue) {
    cmd_l = context.copy_command_list;
    cmd_q = context.copy_command_queue;
  } else if (context.sub_device_count) {
    cmd_l = context.cmd_list[current_sub_device_id];
    cmd_q = context.cmd_queue[current_sub_device_id];
    device = context.sub_devices[current_sub_device_id];
  }

  // Resolve which buffer the advice applies to once, so that the "set" and
  // the "clear" below are guaranteed to target the same allocation.
  void *advised_buffer = nullptr;
  bool has_advice = false;
  switch (advice) {
  case MemoryAdvice::None:
    break;
  case MemoryAdvice::SourceToSystem:
    advised_buffer = source_buffer;
    has_advice = true;
    break;
  case MemoryAdvice::DestinationToSystem:
    advised_buffer = destination_buffer;
    has_advice = true;
    break;
  }

  // Buffer rewritten by the host after every timed copy.
  void *host_touched_buffer =
      is_source_system ? source_buffer : destination_buffer;

  SUCCESS_OR_TERMINATE(zeCommandListReset(cmd_l));
  if (has_advice) {
    SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise(
        cmd_l, device, advised_buffer, buffer_size,
        ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION));
  }
  SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(
      cmd_l, destination_buffer, source_buffer, buffer_size, nullptr, 0,
      nullptr));
  SUCCESS_OR_TERMINATE(zeCommandListClose(cmd_l));

  for (uint32_t i = 0; i < warmup_iterations; i++) {
    SUCCESS_OR_TERMINATE(
        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));
  }

  // Host time is accumulated per iteration: the outer timer spans every
  // memset, so every memset has to be subtracted, not only the last one.
  // NOTE(review): assumes Timer keeps a single start/end pair, i.e.
  // period_minus_overhead() reports only the most recent interval - confirm
  // against the Timer definition.
  long double host_time = 0;

  timer.start();
  for (uint32_t i = 0; i < iters; i++) {
    SUCCESS_OR_TERMINATE(
        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));

    // Presumably forces the pages to be host-resident again before the next
    // copy - TODO confirm the intent with the test author.
    host_timer.start();
    memset(host_touched_buffer, 0, buffer_size);
    host_timer.end();
    host_time += host_timer.period_minus_overhead();
  }
  timer.end();

  long double timed = timer.period_minus_overhead() - host_time;
  timed /= static_cast<long double>(iters);

  if (has_advice) {
    // The measurement list was closed above, so the "clear" advice needs a
    // freshly reset list and must actually be submitted; appending it to the
    // closed list would never reach the device.
    SUCCESS_OR_TERMINATE(zeCommandListReset(cmd_l));
    SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise(
        cmd_l, device, advised_buffer, buffer_size,
        ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION));
    SUCCESS_OR_TERMINATE(zeCommandListClose(cmd_l));
    SUCCESS_OR_TERMINATE(
        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));
  }

  return calculate_gbps(timed, static_cast<long double>(buffer_size));
}
138+
56139
long double ZePeak::_transfer_bw_host_copy(L0Context &context,
57140
void *destination_buffer,
58141
void *source_buffer,
@@ -364,8 +447,94 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
364447
std::cout << "enqueueReadBuffer : ";
365448
std::cout << gflops << " GB/s\n";
366449

367-
current_sub_device_id = 0;
450+
if ((context.device_memory_access_property.sharedSystemAllocCapabilities &
451+
ZE_MEMORY_ACCESS_CAP_FLAG_RW) != 0) {
452+
void *system_memory = malloc(local_memory_size);
453+
if (system_memory == nullptr) {
454+
throw std::runtime_error("malloc failed");
455+
}
456+
memcpy(system_memory, local_memory, local_memory_size);
457+
458+
gflops = 0;
459+
if (context.sub_device_count) {
460+
current_sub_device_id = 0;
461+
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
462+
gflops += _transfer_bw_gpu_copy_with_shared_system(
463+
context, dev_out_buf[i], system_memory, local_memory,
464+
local_memory_size / context.sub_device_count, true);
465+
current_sub_device_id++;
466+
}
467+
gflops = gflops / context.sub_device_count;
468+
} else {
469+
gflops = _transfer_bw_gpu_copy_with_shared_system(
470+
context, device_buffer, system_memory, local_memory,
471+
local_memory_size, true);
472+
}
473+
std::cout << "GPU Copy Shared System Memory to Shared Memory : ";
474+
std::cout << gflops << " GB/s\n";
475+
476+
gflops = 0;
477+
if (context.sub_device_count) {
478+
current_sub_device_id = 0;
479+
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
480+
gflops += _transfer_bw_gpu_copy_with_shared_system(
481+
context, system_memory, dev_out_buf[i], local_memory,
482+
local_memory_size / context.sub_device_count, false);
483+
current_sub_device_id++;
484+
}
485+
gflops = gflops / context.sub_device_count;
486+
} else {
487+
gflops = _transfer_bw_gpu_copy_with_shared_system(
488+
context, system_memory, device_buffer, local_memory,
489+
local_memory_size, false);
490+
}
491+
std::cout << "GPU Copy Shared System Memory from Shared Memory : ";
492+
std::cout << gflops << " GB/s\n";
493+
494+
gflops = 0;
495+
if (context.sub_device_count) {
496+
current_sub_device_id = 0;
497+
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
498+
gflops += _transfer_bw_gpu_copy_with_shared_system(
499+
context, dev_out_buf[i], system_memory, local_memory,
500+
local_memory_size / context.sub_device_count, true,
501+
MemoryAdvice::SourceToSystem);
502+
current_sub_device_id++;
503+
}
504+
gflops = gflops / context.sub_device_count;
505+
} else {
506+
gflops = _transfer_bw_gpu_copy_with_shared_system(
507+
context, device_buffer, system_memory, local_memory,
508+
local_memory_size, true, MemoryAdvice::SourceToSystem);
509+
}
510+
std::cout << "GPU Copy Shared System Memory to Shared Memory with Memory "
511+
"Advice : ";
512+
std::cout << gflops << " GB/s\n";
513+
514+
gflops = 0;
515+
if (context.sub_device_count) {
516+
current_sub_device_id = 0;
517+
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
518+
gflops += _transfer_bw_gpu_copy_with_shared_system(
519+
context, system_memory, dev_out_buf[i], local_memory,
520+
local_memory_size / context.sub_device_count, false,
521+
MemoryAdvice::DestinationToSystem);
522+
current_sub_device_id++;
523+
}
524+
gflops = gflops / context.sub_device_count;
525+
} else {
526+
gflops = _transfer_bw_gpu_copy_with_shared_system(
527+
context, system_memory, device_buffer, local_memory,
528+
local_memory_size, false, MemoryAdvice::DestinationToSystem);
529+
}
530+
std::cout << "GPU Copy Shared System Memory from Shared Memory with Memory "
531+
"Advice : ";
532+
std::cout << gflops << " GB/s\n";
368533

534+
free(system_memory);
535+
}
536+
537+
current_sub_device_id = 0;
369538
_transfer_bw_shared_memory(context, local_memory_size, local_memory);
370539

371540
if (context.sub_device_count) {

perf_tests/ze_peak/src/ze_peak.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,18 @@ void L0Context::init_xe(uint32_t specified_driver, uint32_t specified_device,
320320
if (verbose)
321321
std::cout << "Device Compute Properties retrieved\n";
322322

323+
device_memory_access_property.stype =
324+
ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES;
325+
device_memory_access_property.pNext = nullptr;
326+
result = zeDeviceGetMemoryAccessProperties(device,
327+
&device_memory_access_property);
328+
if (result) {
329+
throw std::runtime_error("zeDeviceGetMemoryAccessProperties failed: " +
330+
std::to_string(result));
331+
}
332+
if (verbose)
333+
std::cout << "Device Memory Access Properties retrieved\n";
334+
323335
zeDeviceGetSubDevices(device, &sub_device_count, nullptr);
324336
if (verbose)
325337
std::cout << "Sub Device Count retrieved\n";

0 commit comments

Comments
 (0)