Skip to content

Commit 00e2fa6

Browse files
authored
[HIP] Add a correctness test for the memmove intrinsic (#146)
So far, the lowering of device-side memmove intrinsics in HIP (and the AMDGPU backend) is only tested with syntactic regression tests. This patch adds functional correctness tests for the device-side memmove intrinsic with and without overlapping source and destination ranges. By testing various statically known or unknown move lengths, the different lowering mechanisms for memmove in the AMDGPU backend are covered.
1 parent 4a13813 commit 00e2fa6

File tree

3 files changed

+316
-0
lines changed

3 files changed

+316
-0
lines changed

External/HIP/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ macro(create_local_hip_tests VariantSuffix)
1515
list(APPEND HIP_LOCAL_TESTS empty)
1616
list(APPEND HIP_LOCAL_TESTS with-fopenmp)
1717
list(APPEND HIP_LOCAL_TESTS saxpy)
18+
list(APPEND HIP_LOCAL_TESTS memmove)
1819
list(APPEND HIP_LOCAL_TESTS InOneWeekend)
1920
list(APPEND HIP_LOCAL_TESTS TheNextWeek)
2021

External/HIP/memmove.hip

Lines changed: 313 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,313 @@
1+
#include <algorithm> // std::min
#include <cassert>
#include <cstdint>   // uint8_t
#include <cstdlib>   // abort
#include <cstring>   // std::memmove
#include <iostream>
#include <memory>    // std::unique_ptr
#include <utility>   // std::pair
#include <vector>

#include "hip/hip_runtime.h"
7+
8+
// Tests for the functional correctness of the lowering of memmove in device
// code, including moves with overlapping source and destination ranges.
// Various memmoves are performed on the device side and the result of each is
// compared to the corresponding operation on the host.
// Global, shared, and stack memory are tested.
13+
14+
// Set to 1 for per-test progress output.
#define VERBOSE 0

// Checks the result of a HIP runtime call and aborts with the error string on
// failure. The argument is evaluated exactly once: the previous version
// re-evaluated `r` when building the error message, which re-ran
// side-effecting calls such as hipDeviceSynchronize() and reported the
// second call's status. The do-while(0) wrapper makes the macro a single
// statement so it composes safely with if/else.
#define CHKHIP(r)                                                              \
  do {                                                                         \
    hipError_t chkhip_result_ = (r);                                           \
    if (chkhip_result_ != hipSuccess) {                                        \
      std::cerr << hipGetErrorString(chkhip_result_) << std::endl;             \
      abort();                                                                 \
    }                                                                          \
  } while (0)
21+
22+
// Element type of all test buffers; one byte wide, so buffer indices and byte
// offsets coincide.
using item_type = uint8_t;

// Maximal number of bytes to copy with a memmove call, used to allocate
// buffers.
#define MAX_BYTES_PER_THREAD 2048

// LDS is small, so run only smaller tests there.
#define MAX_BYTES_PER_THREAD_SHARED 128

// Number of threads in which moves are started in parallel.
#define NUM_MOVE_THREADS (2 * 32)

// Size of blocks in the grid used for move threads. If the number of threads is
// smaller than this, it is used instead.
#define BLOCK_SIZE 256

// Total buffer size: every move thread owns a region of twice the maximal
// move length (see get_stride below).
#define ALLOC_SIZE (2 * NUM_MOVE_THREADS * MAX_BYTES_PER_THREAD)

// Shared-memory (LDS) variant of ALLOC_SIZE, sized for the smaller per-thread
// limit above.
#define ALLOC_SIZE_SHARED (2 * NUM_MOVE_THREADS * MAX_BYTES_PER_THREAD_SHARED)

// The function under test; all device-side moves go through this builtin.
#define TESTED_FUNCTION __builtin_memmove
43+
44+
// Memory spaces in which the device-side memmove is performed.
enum AddressSpace {
  GLOBAL = 0,
  SHARED = 1,
  STACK = 2,
};

// Human-readable names for log output, indexed by the AddressSpace
// enumerators above — keep the two in sync.
static const char *as_names[] = {
    "global",
    "shared",
    "stack",
};
55+
56+
// Distance between the start of consecutive threads' buffer regions. Each
// region is twice the move length so that source and destination ranges
// (each up to bytes_per_thread bytes, offset by up to bytes_per_thread)
// both fit inside one thread's region.
static constexpr size_t get_stride(size_t bytes_per_thread) {
  return bytes_per_thread * 2;
}
59+
60+
// Fills the device buffer with the pattern buf_device[i] = (item_type)i.
// Intended to be launched with a single thread (run_test launches it with
// dim3(1), dim3(1)); that thread initializes the whole buffer sequentially.
__global__ void init_kernel(item_type *buf_device, size_t alloc_size) {
  for (size_t i = 0; i < alloc_size; ++i) {
    buf_device[i] = (item_type)i;
  }
}
65+
66+
template <size_t SZ>
67+
__global__ void move_kernel_global_const(item_type *buf_device, size_t src_idx,
68+
size_t dst_idx, size_t dyn_sz) {
69+
(void)dyn_sz;
70+
int tid = blockDim.x * blockIdx.x + threadIdx.x;
71+
if (tid >= NUM_MOVE_THREADS)
72+
return;
73+
item_type *thread_buf = buf_device + get_stride(SZ) * tid;
74+
TESTED_FUNCTION(thread_buf + dst_idx, thread_buf + src_idx, SZ);
75+
}
76+
77+
template <size_t SZ>
78+
__global__ void move_kernel_shared_const(item_type *buf_device, size_t src_idx,
79+
size_t dst_idx, size_t dyn_sz) {
80+
(void)dyn_sz;
81+
__shared__ item_type buf_shared[ALLOC_SIZE_SHARED];
82+
int tid = blockDim.x * blockIdx.x + threadIdx.x;
83+
if (tid >= NUM_MOVE_THREADS)
84+
return;
85+
constexpr size_t stride = get_stride(SZ);
86+
item_type *thread_buf = buf_device + stride * tid;
87+
item_type *thread_buf_shared = buf_shared + stride * tid;
88+
// Copy the original data to shared memory.
89+
__builtin_memcpy(thread_buf_shared, thread_buf, stride);
90+
// Perform the move there.
91+
TESTED_FUNCTION(thread_buf_shared + dst_idx, thread_buf_shared + src_idx, SZ);
92+
// Copy the modified data back to global memory.
93+
__builtin_memcpy(thread_buf, thread_buf_shared, stride);
94+
}
95+
96+
template <size_t SZ>
97+
__global__ void move_kernel_stack_const(item_type *buf_device, size_t src_idx,
98+
size_t dst_idx, size_t dyn_sz) {
99+
(void)dyn_sz;
100+
constexpr size_t stride = get_stride(SZ);
101+
item_type buf_stack[stride];
102+
int tid = blockDim.x * blockIdx.x + threadIdx.x;
103+
if (tid >= NUM_MOVE_THREADS)
104+
return;
105+
item_type *thread_buf = buf_device + stride * tid;
106+
// Copy the original data to the stack.
107+
__builtin_memcpy(buf_stack, thread_buf, stride);
108+
// Perform the move there.
109+
TESTED_FUNCTION(buf_stack + dst_idx, buf_stack + src_idx, SZ);
110+
// Copy the modified data back to global memory.
111+
__builtin_memcpy(thread_buf, buf_stack, stride);
112+
}
113+
114+
// Moves dyn_sz bytes (a run-time value) from offset src_idx to offset
// dst_idx within this thread's region of global memory.
__global__ void move_kernel_global_var(item_type *buf_device, size_t src_idx,
                                       size_t dst_idx, size_t dyn_sz) {
  const int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  if (thread_id < NUM_MOVE_THREADS) {
    item_type *region = buf_device + thread_id * get_stride(dyn_sz);
    TESTED_FUNCTION(region + dst_idx, region + src_idx, dyn_sz);
  }
}
122+
123+
// Moves dyn_sz bytes (a run-time value) within shared memory (LDS). Each
// thread stages its region in its own slice of the shared buffer, performs
// the move there, and copies the slice back to global memory. No barrier is
// needed since every thread only touches its own slice.
// NOTE(review): buf_shared is indexed by the global thread id, which assumes
// a single block (NUM_MOVE_THREADS <= BLOCK_SIZE holds for the current
// constants) — confirm if these constants are ever enlarged.
__global__ void move_kernel_shared_var(item_type *buf_device, size_t src_idx,
                                       size_t dst_idx, size_t dyn_sz) {
  __shared__ item_type buf_shared[ALLOC_SIZE_SHARED];
  const int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  if (thread_id >= NUM_MOVE_THREADS)
    return;
  const size_t stride = get_stride(dyn_sz);
  item_type *region = buf_device + thread_id * stride;
  item_type *region_shared = buf_shared + thread_id * stride;
  // Stage the original data in shared memory.
  __builtin_memcpy(region_shared, region, stride);
  // Perform the move there.
  TESTED_FUNCTION(region_shared + dst_idx, region_shared + src_idx, dyn_sz);
  // Write the modified data back to global memory.
  __builtin_memcpy(region, region_shared, stride);
}
140+
141+
template <size_t SZ>
142+
__global__ void move_kernel_stack_var(item_type *buf_device, size_t src_idx,
143+
size_t dst_idx, size_t dyn_sz) {
144+
// We use the static SZ to allocate a fixed-size stack variable.
145+
constexpr size_t stride = get_stride(SZ);
146+
item_type buf_stack[stride];
147+
int tid = blockDim.x * blockIdx.x + threadIdx.x;
148+
if (tid >= NUM_MOVE_THREADS)
149+
return;
150+
item_type *thread_buf = buf_device + stride * tid;
151+
// Copy the original data to the stack.
152+
__builtin_memcpy(buf_stack, thread_buf, stride);
153+
// perform the move there
154+
TESTED_FUNCTION(buf_stack + dst_idx, buf_stack + src_idx, dyn_sz);
155+
// Copy the modified data back to global memory.
156+
__builtin_memcpy(thread_buf, buf_stack, stride);
157+
}
158+
159+
template <size_t SZ>
160+
bool run_test(item_type *buf_reference, item_type *buf_host,
161+
item_type *buf_device, size_t src_idx, size_t dst_idx,
162+
bool const_size, AddressSpace AS, size_t &differing_pos) {
163+
// Initialize device buffer.
164+
hipLaunchKernelGGL(init_kernel, dim3(1), dim3(1), 0, 0, buf_device,
165+
ALLOC_SIZE);
166+
CHKHIP(hipDeviceSynchronize());
167+
168+
// Set up the reference buffer.
169+
for (size_t i = 0; i < ALLOC_SIZE; ++i)
170+
buf_reference[i] = (item_type)i;
171+
172+
// Simulate multi-threaded device-side memmove on the host.
173+
for (size_t tid = 0; tid < NUM_MOVE_THREADS; ++tid) {
174+
item_type *thread_buf = buf_reference + get_stride(SZ) * tid;
175+
std::memmove(thread_buf + dst_idx, thread_buf + src_idx, SZ);
176+
}
177+
178+
// Do the device-side memmove.
179+
int block_size = std::min(BLOCK_SIZE, NUM_MOVE_THREADS);
180+
int num_blocks = (NUM_MOVE_THREADS + block_size - 1) / block_size;
181+
182+
switch (AS) {
183+
case AddressSpace::GLOBAL:
184+
hipLaunchKernelGGL(const_size ? move_kernel_global_const<SZ>
185+
: move_kernel_global_var,
186+
dim3(num_blocks), dim3(block_size), 0, 0, buf_device,
187+
src_idx, dst_idx, SZ);
188+
break;
189+
case AddressSpace::SHARED:
190+
hipLaunchKernelGGL(const_size ? move_kernel_shared_const<SZ>
191+
: move_kernel_shared_var,
192+
dim3(num_blocks), dim3(block_size), 0, 0, buf_device,
193+
src_idx, dst_idx, SZ);
194+
break;
195+
case AddressSpace::STACK:
196+
hipLaunchKernelGGL(const_size ? move_kernel_stack_const<SZ>
197+
: move_kernel_stack_var<SZ>,
198+
dim3(num_blocks), dim3(block_size), 0, 0, buf_device,
199+
src_idx, dst_idx, SZ);
200+
break;
201+
};
202+
CHKHIP(hipDeviceSynchronize());
203+
204+
// Fetch the result into buf_host.
205+
CHKHIP(hipMemcpy(buf_host, buf_device, ALLOC_SIZE, hipMemcpyDeviceToHost));
206+
207+
// Compare to the reference.
208+
bool success = true;
209+
for (size_t i = 0; i < ALLOC_SIZE; ++i) {
210+
if (buf_host[i] != buf_reference[i]) {
211+
differing_pos = i;
212+
success = false;
213+
break;
214+
}
215+
}
216+
217+
return success;
218+
}
219+
220+
// Runs all source/destination index combinations for one move length SZ in
// address space AS, once with the length passed dynamically and once with it
// baked in as a compile-time constant. Returns the number of failing
// combinations; failures (and, with VERBOSE, successes) are logged to stdout.
template <size_t SZ>
int run_tests(item_type *buf_reference, item_type *buf_host,
              item_type *buf_device, AddressSpace AS) {
  if (AS == AddressSpace::SHARED && SZ > MAX_BYTES_PER_THREAD_SHARED) {
    // LDS is too small for these tests.
    return 0;
  }
  assert(SZ <= MAX_BYTES_PER_THREAD &&
         "Increase MAX_BYTES_PER_THREAD for larger sizes");

  // (src_idx, dst_idx) pairs: overlaps by one byte in both directions,
  // adjacent non-overlapping ranges, and near-complete overlaps.
  std::vector<std::pair<size_t, size_t>> index_combinations = {
      {0, 1}, {0, SZ}, {0, SZ - 1}, {1, 0}, {SZ, 0}, {SZ - 1, 0},
  };
  if (SZ > 16) {
    // Additionally, partial overlaps at a 16-byte offset (both directions).
    index_combinations.emplace_back(0, 16);
    index_combinations.emplace_back(16, 0);
  }

  int nerrs = 0;

  size_t differing_pos = 0;
  // Runs run_test for every index combination; const_size selects between the
  // static- and dynamic-length kernels. nerrs is accumulated by reference.
  auto test_index_combinations = [&](bool const_size) {
    for (const auto &[src_idx, dst_idx] : index_combinations) {
      bool success = run_test<SZ>(buf_reference, buf_host, buf_device, src_idx,
                                  dst_idx, const_size, AS, differing_pos);
      nerrs += !success;
      if (VERBOSE || !success) {
        std::cout << "- moving [" << src_idx << ", " << (src_idx + SZ - 1)
                  << "] -> [" << dst_idx << ", " << (dst_idx + SZ - 1) << "]";
        if (!VERBOSE) {
          // Without the verbose header lines, include the configuration in
          // the failure message itself.
          std::cout << " with " << (const_size ? "static" : "dynamic")
                    << " size in " << as_names[AS] << " memory";
        }
        std::cout << ":";
        if (success) {
          std::cout << " successful\n";
        } else {
          std::cout << " failed\n -> first difference at index "
                    << differing_pos << '\n';
        }
      }
    }
  };

  if (VERBOSE)
    std::cout << "running tests for dynamic move length " << SZ << " in "
              << as_names[AS] << " memory\n";
  test_index_combinations(false);

  // Different paths in codegen are taken if the move length is statically
  // known.
  if (VERBOSE)
    std::cout << "running tests for static move length " << SZ << " in "
              << as_names[AS] << " memory\n";
  test_index_combinations(true);

  return nerrs;
}
278+
279+
int main(void) {
280+
item_type *buf_device;
281+
CHKHIP(hipMalloc(&buf_device, ALLOC_SIZE));
282+
283+
std::unique_ptr<item_type> buf_host(new item_type[ALLOC_SIZE]);
284+
std::unique_ptr<item_type> buf_reference(new item_type[ALLOC_SIZE]);
285+
286+
int nerrs = 0;
287+
for (AddressSpace AS :
288+
{AddressSpace::GLOBAL, AddressSpace::SHARED, AddressSpace::STACK}) {
289+
nerrs += run_tests<64>(buf_reference.get(), buf_host.get(), buf_device, AS);
290+
nerrs += run_tests<66>(buf_reference.get(), buf_host.get(), buf_device, AS);
291+
nerrs += run_tests<73>(buf_reference.get(), buf_host.get(), buf_device, AS);
292+
nerrs += run_tests<3>(buf_reference.get(), buf_host.get(), buf_device, AS);
293+
nerrs += run_tests<1>(buf_reference.get(), buf_host.get(), buf_device, AS);
294+
295+
// Move lengths that are large enough for the IR lowering in the constant
296+
// case, with simple residual, no residual, and maximal residual:
297+
nerrs +=
298+
run_tests<1025>(buf_reference.get(), buf_host.get(), buf_device, AS);
299+
nerrs +=
300+
run_tests<1040>(buf_reference.get(), buf_host.get(), buf_device, AS);
301+
nerrs +=
302+
run_tests<1039>(buf_reference.get(), buf_host.get(), buf_device, AS);
303+
}
304+
305+
CHKHIP(hipFree(buf_device));
306+
307+
if (nerrs != 0) {
308+
std::cout << nerrs << " errors\n";
309+
return 1;
310+
}
311+
std::cout << "PASSED!\n";
312+
return 0;
313+
}

External/HIP/memmove.reference_output

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
PASSED!
2+
exit 0

0 commit comments

Comments
 (0)