Skip to content

Commit 3257c44

Browse files
authored
SWDEV-505672 - Add performance test for HtoD and DtoH with kernel in-between (#27)
1 parent 552ed61 commit 3257c44

File tree

2 files changed

+125
-0
lines changed

2 files changed

+125
-0
lines changed

catch/performance/memcpy/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ set(TEST_SRC
3030
hipMemcpyDtoHAsync.cc
3131
hipMemcpyHtoD.cc
3232
hipMemcpyHtoDAsync.cc
33+
hipMemcpyWithKernel.cc
3334
hipMemcpyToSymbol.cc
3435
hipMemcpyToSymbolAsync.cc
3536
hipMemcpyFromSymbol.cc
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
/*
2+
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
3+
Permission is hereby granted, free of charge, to any person obtaining a copy
4+
of this software and associated documentation files (the "Software"), to deal
5+
in the Software without restriction, including without limitation the rights
6+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7+
copies of the Software, and to permit persons to whom the Software is
8+
furnished to do so, subject to the following conditions:
9+
The above copyright notice and this permission notice shall be included in
10+
all copies or substantial portions of the Software.
11+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
12+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
13+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
14+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
15+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
17+
THE SOFTWARE.
18+
*/
19+
#include "memcpy_performance_common.hh"
20+
/**
21+
* @addtogroup memcpy memcpy
22+
* @{
23+
* @ingroup PerformanceTest
24+
*/
25+
__global__ void Sum(void* ptr, size_t size) {
26+
size_t index = blockDim.x * blockIdx.x + threadIdx.x;
27+
if (index != 0 && index < size) {
28+
atomicAdd(&((unsigned long long*)ptr)[0], ((unsigned long long*)ptr)[index]);
29+
}
30+
}
31+
class MemcpyHtoDKernelDtoHv1AsyncBenchmark : public Benchmark<MemcpyHtoDKernelDtoHv1AsyncBenchmark> {
32+
public:
33+
void operator()(void* host_mem, void* device_mem, size_t size, const hipStream_t& stream) {
34+
size_t count = size / sizeof(size_t);
35+
for (size_t i = 0; i < count; i++) {
36+
((size_t*)host_mem)[i] = i;
37+
}
38+
TIMED_SECTION_STREAM(kTimerTypeCpu, stream) {
39+
HIP_CHECK(hipMemcpyHtoDAsync((hipDeviceptr_t)device_mem, host_mem, size, stream));
40+
int threads_num = 32;
41+
Sum<<<count / threads_num + 1, threads_num, 0, stream>>>(device_mem, count);
42+
HIP_CHECK(hipMemcpyDtoHAsync(host_mem, (hipDeviceptr_t)device_mem, size, stream));
43+
HIP_CHECK(hipStreamSynchronize(stream));
44+
}
45+
size_t sum = ((size_t*)host_mem)[0];
46+
REQUIRE(sum == count * (count - 1) / 2);
47+
}
48+
};
49+
class MemcpyHtoDKernelDtoHv2AsyncBenchmark : public Benchmark<MemcpyHtoDKernelDtoHv2AsyncBenchmark> {
50+
public:
51+
void operator()(void* host_mem, void* device_mem, size_t size, const hipStream_t& stream) {
52+
size_t count = size / sizeof(size_t);
53+
for (size_t i = 0; i < count; i++) {
54+
((size_t*)host_mem)[i] = i;
55+
}
56+
TIMED_SECTION_STREAM(kTimerTypeCpu, stream) {
57+
HIP_CHECK(hipMemcpyAsync((hipDeviceptr_t)device_mem, host_mem, size, hipMemcpyHostToDevice,
58+
stream));
59+
int threads_num = 32;
60+
Sum<<<count / threads_num + 1, threads_num, 0, stream>>>(device_mem, count);
61+
HIP_CHECK(hipMemcpyWithStream(host_mem, (hipDeviceptr_t)device_mem, size,
62+
hipMemcpyDeviceToHost, stream));
63+
HIP_CHECK(hipStreamSynchronize(stream));
64+
}
65+
size_t sum = ((size_t*)host_mem)[0];
66+
REQUIRE(sum == count * (count - 1) / 2);
67+
}
68+
};
69+
template<typename BenchmarkType>
70+
static void RunBenchmark(LinearAllocs host_allocation_type, LinearAllocs device_allocation_type,
71+
size_t size) {
72+
BenchmarkType benchmark;
73+
if (size < 1_KB) {
74+
benchmark.AddSectionName(std::to_string(size));
75+
} else if (size < 1_MB) {
76+
benchmark.AddSectionName(std::to_string(size / 1024) + std::string(" KB"));
77+
} else {
78+
benchmark.AddSectionName(std::to_string(size / (1024 * 1024)) + std::string(" MB"));
79+
}
80+
benchmark.AddSectionName(GetAllocationSectionName(host_allocation_type));
81+
const StreamGuard stream_guard(Streams::created);
82+
const hipStream_t stream = stream_guard.stream();
83+
LinearAllocGuard<size_t> device_allocation(device_allocation_type, size);
84+
LinearAllocGuard<size_t> host_allocation(host_allocation_type, size);
85+
benchmark.Run(host_allocation.ptr(), device_allocation.ptr(), size, stream);
86+
}
87+
/**
88+
* Test Description
89+
* ------------------------
90+
* - Executes `hipMemcpyHtoDAsync->Kernel->hipMemcpyDtoHAsync` from Device to Host:
91+
* -# Allocation size
92+
* - Small: 4 KB
93+
* - Medium: 4 MB
94+
* - Large: 16 MB
95+
* -# Allocation type
96+
* - Source: device malloc
97+
* - Destination: host pinned and pageable
98+
* Test source
99+
* ------------------------
100+
* - performance/memcpy/hipMemcpyDtoHAsync.cc
101+
* Test requirements
102+
* ------------------------
103+
* - HIP_VERSION >= 5.2
104+
*/
105+
TEST_CASE("Performance_hipMemcpyHtoDKernelDtoHV1Async") {
106+
const auto allocation_size =
107+
GENERATE(16, 128, 1_KB, 4_KB, 16_KB, 256_KB, 512_KB, 1_MB, 4_MB, 16_MB, 128_MB);
108+
const auto device_allocation_type = LinearAllocs::hipMalloc;
109+
const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
110+
RunBenchmark<MemcpyHtoDKernelDtoHv1AsyncBenchmark>(host_allocation_type, device_allocation_type,
111+
allocation_size);
112+
}
113+
TEST_CASE("Performance_hipMemcpyHtoDKernelDtoHV2Async") {
114+
const auto allocation_size =
115+
GENERATE(16, 128, 1_KB, 4_KB, 16_KB, 256_KB, 512_KB, 1_MB, 4_MB, 16_MB, 128_MB);
116+
const auto device_allocation_type = LinearAllocs::hipMalloc;
117+
const auto host_allocation_type = GENERATE(LinearAllocs::malloc, LinearAllocs::hipHostMalloc);
118+
RunBenchmark<MemcpyHtoDKernelDtoHv2AsyncBenchmark>(host_allocation_type, device_allocation_type,
119+
allocation_size);
120+
}
121+
/**
122+
* End doxygen group memcpy.
123+
* @}
124+
*/

0 commit comments

Comments
 (0)