Skip to content

Commit c878ebf

Browse files
committed
[Offload] Add liboffload unit tests for shared/local memory
1 parent 0437895 commit c878ebf

File tree

4 files changed

+93
-1
lines changed

4 files changed

+93
-1
lines changed

offload/unittests/OffloadAPI/device_code/CMakeLists.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ add_offload_test_device_code(foo.c foo)
22
add_offload_test_device_code(bar.c bar)
33
# Compile with optimizations to eliminate AMDGPU implicit arguments.
44
add_offload_test_device_code(noargs.c noargs -O3)
5+
add_offload_test_device_code(localmem.c localmem)
6+
add_offload_test_device_code(localmem_reduction.c localmem_reduction)
57

6-
add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin)
8+
add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin localmem.bin localmem_reduction.bin)
79
set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
#include <gpuintrin.h>
2+
#include <stdint.h>
3+
4+
extern __gpu_local uint32_t shared_mem[];
5+
6+
// Round-trips each thread's index through the dynamically sized local-memory
// array `shared_mem`: the thread stores its index, doubles it in place, and
// copies the result to its own global slot in `out`.
//
// `out` must provide one uint32_t per launched thread, and `shared_mem` one
// uint32_t per thread in a group.
__gpu_kernel void localmem(uint32_t *out) {
  const uint32_t tid = __gpu_thread_id(0);
  // Groups are laid out back to back in `out`.
  const uint32_t slot = tid + (__gpu_num_threads(0) * __gpu_block_id(0));

  // Each thread only ever touches its own element, so no barrier is needed.
  shared_mem[tid] = tid;
  shared_mem[tid] *= 2;
  out[slot] = shared_mem[tid];
}
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#include <gpuintrin.h>
2+
#include <stdint.h>
3+
4+
extern __gpu_local uint32_t shared_mem[];
5+
6+
// Per-group sum over local memory: every thread writes the constant 2 into
// its `shared_mem` element, then thread 0 of the group adds up all elements
// and leaves the total in `out[<group id>]`.
//
// `out` must provide one uint32_t per group, and `shared_mem` one uint32_t
// per thread in a group.
__gpu_kernel void localmem_reduction(uint32_t *out) {
  const uint32_t tid = __gpu_thread_id(0);
  shared_mem[tid] = 2;

  // All stores must have landed before thread 0 starts reading them.
  __gpu_sync_threads();

  // Only the first thread of each group performs the summation.
  if (tid != 0)
    return;

  const uint32_t group = __gpu_block_id(0);
  const uint32_t count = __gpu_num_threads(0);
  out[group] = 0;
  for (uint32_t idx = 0; idx < count; ++idx)
    out[group] += shared_mem[idx];
}

offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,21 @@ struct olLaunchKernelNoArgsTest : LaunchKernelTestBase {
5252
};
5353
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelNoArgsTest);
5454

55+
struct olLaunchKernelLocalMemTest : LaunchKernelTestBase {
56+
void SetUp() override {
57+
RETURN_ON_FATAL_FAILURE(LaunchKernelTestBase::SetUpKernel("localmem"));
58+
}
59+
};
60+
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelLocalMemTest);
61+
62+
struct olLaunchKernelLocalMemReductionTest : LaunchKernelTestBase {
63+
void SetUp() override {
64+
RETURN_ON_FATAL_FAILURE(
65+
LaunchKernelTestBase::SetUpKernel("localmem_reduction"));
66+
}
67+
};
68+
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelLocalMemReductionTest);
69+
5570
TEST_P(olLaunchKernelTest, Success) {
5671
void *Mem;
5772
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
@@ -99,3 +114,51 @@ TEST_P(olLaunchKernelTest, SuccessSynchronous) {
99114

100115
ASSERT_SUCCESS(olMemFree(Mem));
101116
}
117+
118+
// Launches the "localmem" kernel over four groups and checks that every
// thread wrote twice its in-group thread id to its slot of the output buffer.
//
// The dynamic shared-memory size and the expected values are derived from
// LaunchArgs.GroupSize.x rather than a literal, so the test stays correct if
// the fixture's group size changes.
TEST_P(olLaunchKernelLocalMemTest, Success) {
  LaunchArgs.NumGroups.x = 4;
  // The kernel indexes its local-memory array by thread id, so it needs one
  // uint32_t per thread in a group.
  LaunchArgs.DynSharedMemory = LaunchArgs.GroupSize.x * sizeof(uint32_t);

  void *Mem;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            LaunchArgs.GroupSize.x * LaunchArgs.NumGroups.x *
                                sizeof(uint32_t),
                            &Mem));
  // Kernel argument block: a single pointer to the output buffer.
  struct {
    void *Mem;
  } Args{Mem};

  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
                                &LaunchArgs, nullptr));

  // Wait for the kernel to finish before reading the results on the host.
  ASSERT_SUCCESS(olWaitQueue(Queue));

  uint32_t *Data = static_cast<uint32_t *>(Mem);
  // Slot i belongs to in-group thread (i % GroupSize.x), which stored 2 * id.
  for (uint32_t i = 0; i < LaunchArgs.GroupSize.x * LaunchArgs.NumGroups.x; i++)
    ASSERT_EQ(Data[i], (i % LaunchArgs.GroupSize.x) * 2);

  ASSERT_SUCCESS(olMemFree(Mem));
}
142+
143+
// Launches the "localmem_reduction" kernel over four groups and checks that
// each group's output element holds the sum of the per-thread values, i.e.
// 2 * GroupSize.x.
//
// The dynamic shared-memory size is derived from LaunchArgs.GroupSize.x so
// that it always agrees with the expectation below, which uses the same
// value.
TEST_P(olLaunchKernelLocalMemReductionTest, Success) {
  LaunchArgs.NumGroups.x = 4;
  // The kernel stores one uint32_t per thread of a group in local memory.
  LaunchArgs.DynSharedMemory = LaunchArgs.GroupSize.x * sizeof(uint32_t);

  void *Mem;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            LaunchArgs.NumGroups.x * sizeof(uint32_t), &Mem));
  // Kernel argument block: a single pointer to the output buffer.
  struct {
    void *Mem;
  } Args{Mem};

  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
                                &LaunchArgs, nullptr));

  // Wait for the kernel to finish before reading the results on the host.
  ASSERT_SUCCESS(olWaitQueue(Queue));

  uint32_t *Data = static_cast<uint32_t *>(Mem);
  // Every thread contributed the value 2, so each group sums to 2 * size.
  for (uint32_t i = 0; i < LaunchArgs.NumGroups.x; i++)
    ASSERT_EQ(Data[i], 2 * LaunchArgs.GroupSize.x);

  ASSERT_SUCCESS(olMemFree(Mem));
}

0 commit comments

Comments
 (0)