Commit 6ea7054
[HIP] Add a __builtin_memset test (#276)
This is mostly analogous to the existing memmove test: __builtin_memset is called on the device with different lengths, in different address spaces, at different dynamic alignments, and with constant and dynamic operands. The result is compared to a reference computation on the host to ensure functional correctness of the memset lowering. Also adds a missing standard library include to the memmove test. For SWDEV-543208.
1 parent 69f057c commit 6ea7054
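
Distilled, the pattern the new test exercises looks like the sketch below. This is an editorial illustration, not code from this commit; the buffer size, the fill value 0xab, and the name set_kernel are made up. A kernel fills device memory through __builtin_memset, and the host checks the bytes against a std::memset reference.

#include <cstdint>
#include <cstring>
#include <iostream>

#include "hip/hip_runtime.h"

// Device-side fill through the builtin; the compiler's lowering of this call
// is what such a test validates.
__global__ void set_kernel(uint8_t *buf, size_t n) {
  __builtin_memset(buf, 0xab, n);
}

int main() {
  constexpr size_t N = 256;
  uint8_t *buf_device;
  if (hipMalloc(&buf_device, N) != hipSuccess)
    return 1;

  // Run the fill on a single thread and wait for it to finish.
  hipLaunchKernelGGL(set_kernel, dim3(1), dim3(1), 0, 0, buf_device, N);
  if (hipDeviceSynchronize() != hipSuccess)
    return 1;

  // Compute the reference on the host and compare byte for byte.
  uint8_t buf_host[N], buf_reference[N];
  std::memset(buf_reference, 0xab, N);
  if (hipMemcpy(buf_host, buf_device, N, hipMemcpyDeviceToHost) != hipSuccess)
    return 1;

  bool ok = std::memcmp(buf_host, buf_reference, N) == 0;
  std::cout << (ok ? "PASSED!\n" : "FAILED\n");
  return ok ? 0 : 1;
}

The committed test generalizes this over set lengths, destination offsets, address spaces, and the constness of each __builtin_memset operand.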

File tree

4 files changed (+363 −0 lines)

External/HIP/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -70,6 +70,7 @@ macro(create_local_hip_tests VariantSuffix)
   list(APPEND HIP_LOCAL_TESTS with-fopenmp)
   list(APPEND HIP_LOCAL_TESTS saxpy)
   list(APPEND HIP_LOCAL_TESTS memmove)
+  list(APPEND HIP_LOCAL_TESTS memset)
   list(APPEND HIP_LOCAL_TESTS split-kernel-args)
   list(APPEND HIP_LOCAL_TESTS builtin-logb-scalbn)

External/HIP/memmove.hip

Lines changed: 1 addition & 0 deletions

@@ -1,6 +1,7 @@
 #include <cassert>
 #include <cstring>
 #include <iostream>
+#include <memory>
 #include <vector>

 #include "hip/hip_runtime.h"

External/HIP/memset.hip

Lines changed: 359 additions & 0 deletions (new file)
#include <cassert>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <memory>
#include <vector>

#include "hip/hip_runtime.h"

// Tests for the functional correctness of the lowering of memset in device
// code. Various memsets are performed on the device side and the result of
// each is compared to the corresponding operation on the host. Global,
// shared, and stack memory is tested.

#define VERBOSE 0

#define CHKHIP(r)                                                              \
  if (r != hipSuccess) {                                                       \
    std::cerr << hipGetErrorString(r) << std::endl;                            \
    abort();                                                                   \
  }

// Maximal number of bytes to set with a memset call, used to allocate
// buffers.
static constexpr size_t MaxBytesPerThread = 2048;

// LDS is small, so run only smaller tests there.
static constexpr size_t MaxLDSBytesPerThread = 128;

// Number of threads started in parallel.
static constexpr size_t NumMoveThreads = 2 * 32;

// Size of blocks in the grid used for threads. If the number of threads is
// smaller than this, it is used instead.
static constexpr size_t BlockSize = 256;

static constexpr size_t AllocSize = 2 * NumMoveThreads * MaxBytesPerThread;

static constexpr size_t LDSAllocSize =
    2 * NumMoveThreads * MaxLDSBytesPerThread;

enum AddressSpace {
  GLOBAL = 0,
  SHARED = 1,
  STACK = 2,
};

static const char *as_names[] = {
    "global",
    "shared",
    "stack",
};

static constexpr size_t get_stride(size_t bytes_per_thread) {
  return 2 * bytes_per_thread;
}

/// Initialize \p alloc_size bytes of \p buf_device to increasing numbers
/// (modulo 256).
__global__ void init_kernel(uint8_t *buf_device, size_t alloc_size) {
  for (size_t i = 0; i < alloc_size; ++i) {
    buf_device[i] = (uint8_t)i;
  }
}

template <size_t SZ, uint8_t SetVal, bool const_size, bool use_tid,
          bool const_setval>
__global__ void memset_kernel_global(uint8_t *buf_device, size_t dst_idx,
                                     uint8_t dyn_setval, size_t dyn_sz) {
  (void)dyn_sz;
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= NumMoveThreads)
    return;
  uint8_t *thread_buf = buf_device + get_stride(SZ) * tid;

  if constexpr (const_size) {
    if constexpr (use_tid) {
      __builtin_memset(thread_buf + dst_idx, static_cast<uint8_t>(tid), SZ);
    } else if constexpr (const_setval) {
      __builtin_memset(thread_buf + dst_idx, SetVal, SZ);
    } else {
      __builtin_memset(thread_buf + dst_idx, dyn_setval, SZ);
    }
  } else {
    if constexpr (use_tid) {
      __builtin_memset(thread_buf + dst_idx, static_cast<uint8_t>(tid), dyn_sz);
    } else if constexpr (const_setval) {
      __builtin_memset(thread_buf + dst_idx, SetVal, dyn_sz);
    } else {
      __builtin_memset(thread_buf + dst_idx, dyn_setval, dyn_sz);
    }
  }
}

template <size_t SZ, uint8_t SetVal, bool const_size, bool use_tid,
          bool const_setval>
__global__ void memset_kernel_shared(uint8_t *buf_device, size_t dst_idx,
                                     uint8_t dyn_setval, size_t dyn_sz) {
  (void)dyn_sz;
  __shared__ uint8_t buf_shared[LDSAllocSize];
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= NumMoveThreads)
    return;
  constexpr size_t stride = get_stride(SZ);
  uint8_t *thread_buf = buf_device + stride * tid;
  uint8_t *thread_buf_shared = buf_shared + stride * tid;
  // Copy the original data to shared memory.
  __builtin_memcpy(thread_buf_shared, thread_buf, stride);

  // Perform the memset there.
  if constexpr (const_size) {
    if constexpr (use_tid) {
      __builtin_memset(thread_buf_shared + dst_idx, (uint8_t)tid, SZ);
    } else if constexpr (const_setval) {
      __builtin_memset(thread_buf_shared + dst_idx, SetVal, SZ);
    } else {
      __builtin_memset(thread_buf_shared + dst_idx, dyn_setval, SZ);
    }
  } else {
    if constexpr (use_tid) {
      __builtin_memset(thread_buf_shared + dst_idx, (uint8_t)tid, dyn_sz);
    } else if constexpr (const_setval) {
      __builtin_memset(thread_buf_shared + dst_idx, SetVal, dyn_sz);
    } else {
      __builtin_memset(thread_buf_shared + dst_idx, dyn_setval, dyn_sz);
    }
  }

  // Copy the modified data back to global memory.
  __builtin_memcpy(thread_buf, thread_buf_shared, stride);
}

template <size_t SZ, uint8_t SetVal, bool const_size, bool use_tid,
          bool const_setval>
__global__ void memset_kernel_stack(uint8_t *buf_device, size_t dst_idx,
                                    uint8_t dyn_setval, size_t dyn_sz) {
  (void)dyn_sz;
  constexpr size_t stride = get_stride(SZ);
  uint8_t buf_stack[stride];
  int tid = blockDim.x * blockIdx.x + threadIdx.x;
  if (tid >= NumMoveThreads)
    return;
  uint8_t *thread_buf = buf_device + stride * tid;
  // Copy the original data to the stack.
  __builtin_memcpy(buf_stack, thread_buf, stride);

  // Perform the memset there.
  if constexpr (const_size) {
    if constexpr (use_tid) {
      __builtin_memset(buf_stack + dst_idx, (uint8_t)tid, SZ);
    } else if constexpr (const_setval) {
      __builtin_memset(buf_stack + dst_idx, SetVal, SZ);
    } else {
      __builtin_memset(buf_stack + dst_idx, dyn_setval, SZ);
    }
  } else {
    if constexpr (use_tid) {
      __builtin_memset(buf_stack + dst_idx, (uint8_t)tid, dyn_sz);
    } else if constexpr (const_setval) {
      __builtin_memset(buf_stack + dst_idx, SetVal, dyn_sz);
    } else {
      __builtin_memset(buf_stack + dst_idx, dyn_setval, dyn_sz);
    }
  }

  // Copy the modified data back to global memory.
  __builtin_memcpy(thread_buf, buf_stack, stride);
}

template <size_t SZ, uint8_t SetVal>
bool run_test(uint8_t *buf_reference, uint8_t *buf_host, uint8_t *buf_device,
              size_t dst_idx, bool const_size, bool use_tid, bool const_setval,
              AddressSpace AS, size_t &differing_pos) {
  // Initialize device buffer.
  hipLaunchKernelGGL(init_kernel, dim3(1), dim3(1), 0, 0, buf_device,
                     AllocSize);
  CHKHIP(hipDeviceSynchronize());

  // Set up the reference buffer.
  for (size_t i = 0; i < AllocSize; ++i)
    buf_reference[i] = (uint8_t)i;

  // Simulate multi-threaded device-side memset on the host.
  for (size_t tid = 0; tid < NumMoveThreads; ++tid) {
    uint8_t *thread_buf = buf_reference + get_stride(SZ) * tid;
    uint8_t v = use_tid ? tid : SetVal;
    std::memset(thread_buf + dst_idx, v, SZ);
  }

  // Do the device-side memset.
  int block_size = std::min(BlockSize, NumMoveThreads);
  int num_blocks = (NumMoveThreads + block_size - 1) / block_size;

  // Select the right kernel with the right template parameters. This is done
  // using compile-time constant template parameters so that we can control
  // which memset arguments the compiler sees as constant, as this affects
  // code generation.
  void (*chosen_kernel)(uint8_t *, size_t, uint8_t, size_t) = nullptr;

#define SELECT_KERNEL_FOR_ADDRSPACE(AS)                                        \
  if (const_size) {                                                            \
    if (use_tid)                                                               \
      chosen_kernel = memset_kernel_##AS<SZ, SetVal, true, true, false>;       \
    else if (const_setval)                                                     \
      chosen_kernel = memset_kernel_##AS<SZ, SetVal, true, false, true>;       \
    else                                                                       \
      chosen_kernel = memset_kernel_##AS<SZ, SetVal, true, false, false>;      \
  } else {                                                                     \
    if (use_tid)                                                               \
      chosen_kernel = memset_kernel_##AS<SZ, SetVal, false, true, false>;      \
    else if (const_setval)                                                     \
      chosen_kernel = memset_kernel_##AS<SZ, SetVal, false, false, true>;      \
    else                                                                       \
      chosen_kernel = memset_kernel_##AS<SZ, SetVal, false, false, false>;     \
  }

  switch (AS) {
  case AddressSpace::GLOBAL:
    SELECT_KERNEL_FOR_ADDRSPACE(global);
    break;
  case AddressSpace::SHARED:
    SELECT_KERNEL_FOR_ADDRSPACE(shared);
    break;
  case AddressSpace::STACK:
    SELECT_KERNEL_FOR_ADDRSPACE(stack);
    break;
  }
  hipLaunchKernelGGL(chosen_kernel, dim3(num_blocks), dim3(block_size), 0, 0,
                     buf_device, dst_idx, SetVal, SZ);
  CHKHIP(hipDeviceSynchronize());

  // Fetch the result into buf_host.
  CHKHIP(hipMemcpy(buf_host, buf_device, AllocSize, hipMemcpyDeviceToHost));

  // Compare to the reference.
  bool success = true;
  for (size_t i = 0; i < AllocSize; ++i) {
    if (buf_host[i] != buf_reference[i]) {
      differing_pos = i;
      success = false;
      break;
    }
  }

  return success;
}

template <size_t SZ, uint8_t SetVal>
int run_tests(uint8_t *buf_reference, uint8_t *buf_host, uint8_t *buf_device,
              AddressSpace AS) {
  if (AS == AddressSpace::SHARED && SZ > MaxLDSBytesPerThread) {
    // LDS is too small for these tests.
    return 0;
  }
  assert(SZ <= MaxBytesPerThread &&
         "Increase MaxBytesPerThread for larger sizes");

  std::vector<size_t> indexes_to_test = {0, 1, SZ - 1, SZ};
  if (SZ > 8) {
    indexes_to_test.emplace_back(7);
    indexes_to_test.emplace_back(8);
  }
  if (SZ > 16) {
    indexes_to_test.emplace_back(15);
    indexes_to_test.emplace_back(16);
  }

  int nerrs = 0;

  size_t differing_pos = 0;
  auto test_indexes = [&](bool const_size, bool use_tid, bool const_setval) {
    for (const auto &dst_idx : indexes_to_test) {
      bool success = run_test<SZ, SetVal>(buf_reference, buf_host, buf_device,
                                          dst_idx, const_size, use_tid,
                                          const_setval, AS, differing_pos);
      nerrs += !success;
      if (VERBOSE || !success) {
        std::cout << "- memsetting [" << dst_idx << ", " << (dst_idx + SZ - 1)
                  << "] to ";
        if (use_tid) {
          std::cout << "the thread id";
        } else {
          std::cout << static_cast<int>(SetVal) << " ("
                    << (const_setval ? "const" : "dynamic") << ")";
        }
        if (!VERBOSE) {
          std::cout << " with " << (const_size ? "static" : "dynamic")
                    << " size in " << as_names[AS] << " memory";
        }
        std::cout << ":";
        if (success) {
          std::cout << " successful\n";
        } else {
          std::cout << " failed\n -> first difference at index "
                    << differing_pos << '\n';
        }
      }
    }
  };

  if (VERBOSE)
    std::cout << "running tests for dynamic move length " << SZ << " in "
              << as_names[AS] << " memory\n";
  test_indexes(false, false, false);
  test_indexes(false, false, true);
  test_indexes(false, true, false);

  // Different paths in codegen are taken if the move length is statically
  // known.
  if (VERBOSE)
    std::cout << "running tests for static move length " << SZ << " in "
              << as_names[AS] << " memory\n";
  test_indexes(true, false, false);
  test_indexes(true, false, true);
  test_indexes(true, true, false);

  return nerrs;
}

int main(void) {
  uint8_t *buf_device;
  CHKHIP(hipMalloc(&buf_device, AllocSize));

  std::unique_ptr<uint8_t[]> buf_host(new uint8_t[AllocSize]);
  std::unique_ptr<uint8_t[]> buf_reference(new uint8_t[AllocSize]);

  int nerrs = 0;
  for (AddressSpace AS :
       {AddressSpace::GLOBAL, AddressSpace::SHARED, AddressSpace::STACK}) {
    nerrs += run_tests<64, 0xbb>(buf_reference.get(), buf_host.get(),
                                 buf_device, AS);
    nerrs += run_tests<66, 0xbb>(buf_reference.get(), buf_host.get(),
                                 buf_device, AS);
    nerrs += run_tests<73, 0xbb>(buf_reference.get(), buf_host.get(),
                                 buf_device, AS);
    nerrs +=
        run_tests<3, 0xbb>(buf_reference.get(), buf_host.get(), buf_device, AS);
    nerrs +=
        run_tests<1, 0xbb>(buf_reference.get(), buf_host.get(), buf_device, AS);

    // Lengths that are large enough for the IR lowering in the constant
    // case, with simple residual, no residual, and maximal residual:
    nerrs += run_tests<1025, 0xbb>(buf_reference.get(), buf_host.get(),
                                   buf_device, AS);
    nerrs += run_tests<1040, 0xbb>(buf_reference.get(), buf_host.get(),
                                   buf_device, AS);
    nerrs += run_tests<1039, 0xbb>(buf_reference.get(), buf_host.get(),
                                   buf_device, AS);
  }

  CHKHIP(hipFree(buf_device));

  if (nerrs != 0) {
    std::cout << nerrs << " errors\n";
    return 1;
  }
  std::cout << "PASSED!\n";
  return 0;
}
External/HIP/memset.reference_output

Lines changed: 2 additions & 0 deletions (new file)

PASSED!
exit 0
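
The reference_output file holds the expected stdout ("PASSED!") and exit status against which the harness compares the test's actual output. With the commit applied to an llvm-test-suite build configured for HIP, the new test would typically be run through lit; the path below is illustrative:

llvm-lit -v <test-suite-build-dir>/External/HIP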
