Skip to content

Commit f7225a3

Browse files
feat(ze_peak): Add Shared System Memory transfer bandwidth tests
Signed-off-by: Misiak, Konstanty <[email protected]>
1 parent f42c89e commit f7225a3

File tree

4 files changed

+189
-1
lines changed

4 files changed

+189
-1
lines changed

perf_tests/ze_peak/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ ze_peak measures the following:
1515
* Double Precision Compute in GigaFlops
1616
* Integer Compute in GigaInteger Flops
1717
* Memory Transfer Bandwidth in GigaBytes Per Second
18+
* GPU Copy Shared System Memory <-> Device Memory
1819
* GPU Copy Host <-> Shared Memory
1920
* System Memory Copy Host <-> Shared Memory
2021
* Kernel Launch Latency in microseconds

perf_tests/ze_peak/include/ze_peak.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ struct L0Context {
7575
nullptr};
7676
ze_device_compute_properties_t device_compute_property = {
7777
ZE_STRUCTURE_TYPE_DEVICE_COMPUTE_PROPERTIES, nullptr};
78+
ze_device_memory_access_properties_t device_memory_access_property = {
79+
ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES, nullptr};
7880
bool verbose = false;
7981
std::vector<ze_command_queue_group_properties_t> queueProperties;
8082

@@ -155,9 +157,15 @@ class ZePeak {
155157
#endif
156158

157159
private:
160+
enum class MemoryAdvice { None, SourceToSystem, DestinationToSystem };
161+
158162
long double _transfer_bw_gpu_copy(L0Context &context,
159163
void *destination_buffer,
160164
void *source_buffer, size_t buffer_size);
165+
long double _transfer_bw_gpu_copy_with_shared_system(
166+
L0Context &context, void *destination_buffer, void *source_buffer,
167+
void *input_buffer, size_t buffer_size, bool is_source_system,
168+
MemoryAdvice advice = MemoryAdvice::None);
161169
long double _transfer_bw_host_copy(L0Context &context,
162170
void *destination_buffer,
163171
void *source_buffer, size_t buffer_size,

perf_tests/ze_peak/src/transfer_bw.cpp

Lines changed: 168 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,87 @@ long double ZePeak::_transfer_bw_gpu_copy(L0Context &context,
5353
return calculate_gbps(timed, static_cast<long double>(buffer_size));
5454
}
5555

56+
/// Times a GPU memory copy in which one side is a host-touched ("shared
/// system") buffer, and converts the average per-iteration time into a
/// bandwidth figure via calculate_gbps().
///
/// A command list containing an optional memory advice followed by a single
/// copy of `buffer_size` bytes from `source_buffer` to `destination_buffer`
/// is executed `warmup_iterations` times untimed and then `iters` times
/// timed. After every timed execution the host memsets the system-side
/// buffer; the time spent in those memsets is subtracted from the total.
///
/// @param context            Supplies the command list / queue / device. The
///                           copy queue is used when present, otherwise the
///                           current sub-device's queue when sub-devices
///                           exist, otherwise the default queue.
/// @param destination_buffer Destination of the GPU copy.
/// @param source_buffer      Source of the GPU copy.
/// @param input_buffer       Unused; kept so existing callers keep compiling.
/// @param buffer_size        Number of bytes copied per iteration.
/// @param is_source_system   true: the host memsets `source_buffer` between
///                           iterations; false: it memsets
///                           `destination_buffer`.
/// @param advice             Which buffer (if any) gets the
///                           "system memory preferred location" advice for
///                           the duration of the measurement.
/// @return calculate_gbps(average time per iteration, buffer_size).
long double ZePeak::_transfer_bw_gpu_copy_with_shared_system(
    L0Context &context, void *destination_buffer, void *source_buffer,
    void *input_buffer, size_t buffer_size, bool is_source_system,
    MemoryAdvice advice) {
  // Not used by this measurement; silences unused-parameter warnings.
  (void)input_buffer;

  Timer<std::chrono::nanoseconds::period> timer;
  Timer<std::chrono::nanoseconds::period> host_timer;

  auto cmd_l = context.command_list;
  auto cmd_q = context.command_queue;
  auto device = context.device;

  if (context.copy_command_queue) {
    cmd_l = context.copy_command_list;
    cmd_q = context.copy_command_queue;
  } else if (context.sub_device_count) {
    cmd_l = context.cmd_list[current_sub_device_id];
    cmd_q = context.cmd_queue[current_sub_device_id];
    device = context.sub_devices[current_sub_device_id];
  }

  // Resolve once which buffer the advice applies to; nullptr means "none".
  void *advised_buffer = nullptr;
  switch (advice) {
  case MemoryAdvice::SourceToSystem:
    advised_buffer = source_buffer;
    break;
  case MemoryAdvice::DestinationToSystem:
    advised_buffer = destination_buffer;
    break;
  case MemoryAdvice::None:
    break;
  }

  SUCCESS_OR_TERMINATE(zeCommandListReset(cmd_l));
  if (advised_buffer != nullptr) {
    SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise(
        cmd_l, device, advised_buffer, buffer_size,
        ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION));
  }
  SUCCESS_OR_TERMINATE(zeCommandListAppendMemoryCopy(
      cmd_l, destination_buffer, source_buffer, buffer_size, nullptr, 0,
      nullptr));
  SUCCESS_OR_TERMINATE(zeCommandListClose(cmd_l));

  for (uint32_t i = 0; i < warmup_iterations; i++) {
    SUCCESS_OR_TERMINATE(
        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));
  }

  // host_timer is restarted every iteration, so its reading has to be
  // accumulated inside the loop; reading it once after the loop would only
  // account for the final memset while `timer` spans all of them.
  // NOTE(review): assumes Timer reports the most recent start()/end()
  // interval rather than a running total - confirm against Timer.
  long double host_time = 0;

  timer.start();
  for (uint32_t i = 0; i < iters; i++) {
    SUCCESS_OR_TERMINATE(
        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));

    // Host write to the system-side buffer between GPU copies.
    host_timer.start();
    if (is_source_system) {
      memset(source_buffer, 0, buffer_size);
    } else {
      memset(destination_buffer, 0, buffer_size);
    }
    host_timer.end();
    host_time += host_timer.period_minus_overhead();
  }
  timer.end();

  long double timed = timer.period_minus_overhead() - host_time;
  timed /= static_cast<long double>(iters);

  // Undo the advice. The command list was closed above, so it must be reset
  // before anything else can be appended, and the clear only takes effect if
  // the list is actually submitted.
  if (advised_buffer != nullptr) {
    SUCCESS_OR_TERMINATE(zeCommandListReset(cmd_l));
    SUCCESS_OR_TERMINATE(zeCommandListAppendMemAdvise(
        cmd_l, device, advised_buffer, buffer_size,
        ZE_MEMORY_ADVICE_CLEAR_SYSTEM_MEMORY_PREFERRED_LOCATION));
    SUCCESS_OR_TERMINATE(zeCommandListClose(cmd_l));
    SUCCESS_OR_TERMINATE(
        zeCommandQueueExecuteCommandLists(cmd_q, 1, &cmd_l, nullptr));
    SUCCESS_OR_TERMINATE(zeCommandQueueSynchronize(cmd_q, UINT64_MAX));
  }

  return calculate_gbps(timed, static_cast<long double>(buffer_size));
}
136+
56137
long double ZePeak::_transfer_bw_host_copy(L0Context &context,
57138
void *destination_buffer,
58139
void *source_buffer,
@@ -364,8 +445,94 @@ void ZePeak::ze_peak_transfer_bw(L0Context &context) {
364445
std::cout << "enqueueReadBuffer : ";
365446
std::cout << gflops << " GB/s\n";
366447

367-
current_sub_device_id = 0;
448+
if ((context.device_memory_access_property.sharedSystemAllocCapabilities &
449+
ZE_MEMORY_ACCESS_CAP_FLAG_RW) != 0) {
450+
void *system_memory = malloc(local_memory_size);
451+
if (system_memory == nullptr) {
452+
throw std::runtime_error("malloc failed");
453+
}
454+
memcpy(system_memory, local_memory, local_memory_size);
455+
456+
gflops = 0;
457+
if (context.sub_device_count) {
458+
current_sub_device_id = 0;
459+
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
460+
gflops += _transfer_bw_gpu_copy_with_shared_system(
461+
context, dev_out_buf[i], system_memory, local_memory,
462+
local_memory_size / context.sub_device_count, true);
463+
current_sub_device_id++;
464+
}
465+
gflops = gflops / context.sub_device_count;
466+
} else {
467+
gflops = _transfer_bw_gpu_copy_with_shared_system(
468+
context, device_buffer, system_memory, local_memory,
469+
local_memory_size, true);
470+
}
471+
std::cout << "GPU Copy Shared System Memory to Shared Memory : ";
472+
std::cout << gflops << " GB/s\n";
473+
474+
gflops = 0;
475+
if (context.sub_device_count) {
476+
current_sub_device_id = 0;
477+
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
478+
gflops += _transfer_bw_gpu_copy_with_shared_system(
479+
context, system_memory, dev_out_buf[i], local_memory,
480+
local_memory_size / context.sub_device_count, false);
481+
current_sub_device_id++;
482+
}
483+
gflops = gflops / context.sub_device_count;
484+
} else {
485+
gflops = _transfer_bw_gpu_copy_with_shared_system(
486+
context, system_memory, device_buffer, local_memory,
487+
local_memory_size, false);
488+
}
489+
std::cout << "GPU Copy Shared System Memory from Shared Memory : ";
490+
std::cout << gflops << " GB/s\n";
491+
492+
gflops = 0;
493+
if (context.sub_device_count) {
494+
current_sub_device_id = 0;
495+
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
496+
gflops += _transfer_bw_gpu_copy_with_shared_system(
497+
context, dev_out_buf[i], system_memory, local_memory,
498+
local_memory_size / context.sub_device_count, true,
499+
MemoryAdvice::SourceToSystem);
500+
current_sub_device_id++;
501+
}
502+
gflops = gflops / context.sub_device_count;
503+
} else {
504+
gflops = _transfer_bw_gpu_copy_with_shared_system(
505+
context, device_buffer, system_memory, local_memory,
506+
local_memory_size, true, MemoryAdvice::SourceToSystem);
507+
}
508+
std::cout << "GPU Copy Shared System Memory to Shared Memory with Memory "
509+
"Advice : ";
510+
std::cout << gflops << " GB/s\n";
511+
512+
gflops = 0;
513+
if (context.sub_device_count) {
514+
current_sub_device_id = 0;
515+
for (uint32_t i = 0U; i < context.sub_device_count; i++) {
516+
gflops += _transfer_bw_gpu_copy_with_shared_system(
517+
context, system_memory, dev_out_buf[i], local_memory,
518+
local_memory_size / context.sub_device_count, false,
519+
MemoryAdvice::DestinationToSystem);
520+
current_sub_device_id++;
521+
}
522+
gflops = gflops / context.sub_device_count;
523+
} else {
524+
gflops = _transfer_bw_gpu_copy_with_shared_system(
525+
context, system_memory, device_buffer, local_memory,
526+
local_memory_size, false, MemoryAdvice::DestinationToSystem);
527+
}
528+
std::cout << "GPU Copy Shared System Memory from Shared Memory with Memory "
529+
"Advice : ";
530+
std::cout << gflops << " GB/s\n";
368531

532+
free(system_memory);
533+
}
534+
535+
current_sub_device_id = 0;
369536
_transfer_bw_shared_memory(context, local_memory_size, local_memory);
370537

371538
if (context.sub_device_count) {

perf_tests/ze_peak/src/ze_peak.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,18 @@ void L0Context::init_xe(uint32_t specified_driver, uint32_t specified_device,
320320
if (verbose)
321321
std::cout << "Device Compute Properties retrieved\n";
322322

323+
device_memory_access_property.stype =
324+
ZE_STRUCTURE_TYPE_DEVICE_MEMORY_ACCESS_PROPERTIES;
325+
device_memory_access_property.pNext = nullptr;
326+
result = zeDeviceGetMemoryAccessProperties(device,
327+
&device_memory_access_property);
328+
if (result) {
329+
throw std::runtime_error("zeDeviceGetMemoryAccessProperties failed: " +
330+
std::to_string(result));
331+
}
332+
if (verbose)
333+
std::cout << "Device Memory Access Properties retrieved\n";
334+
323335
zeDeviceGetSubDevices(device, &sub_device_count, nullptr);
324336
if (verbose)
325337
std::cout << "Sub Device Count retrieved\n";

0 commit comments

Comments
 (0)