Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion offload/unittests/OffloadAPI/device_code/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ add_offload_test_device_code(foo.c foo)
add_offload_test_device_code(bar.c bar)
# Compile with optimizations to eliminate AMDGPU implicit arguments.
add_offload_test_device_code(noargs.c noargs -O3)
# Kernels that use group-local (shared) memory; no extra compile flags are
# passed for these.
add_offload_test_device_code(localmem.c localmem)
add_offload_test_device_code(localmem_reduction.c localmem_reduction)

# Aggregate target depending on every device binary listed in DEPENDS.
# NOTE(review): the two add_custom_target lines below look like the
# before/after pair of a diff; only the second lists the localmem binaries.
add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin)
add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin localmem.bin localmem_reduction.bin)
# Publish this directory's binary dir to the parent scope as the device code
# path.
set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
11 changes: 11 additions & 0 deletions offload/unittests/OffloadAPI/device_code/localmem.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#include <gpuintrin.h>
#include <stdint.h>

// Unsized group-local buffer. NOTE(review): presumably backed by the dynamic
// shared memory size the host requests at launch -- confirm against the test.
extern __gpu_local uint32_t shared_mem[];

// Each thread writes its own thread id into its shared_mem slot, doubles the
// slot in place, then copies the result to `out` at its block-offset index.
__gpu_kernel void localmem(uint32_t *out) {
  const uint32_t tid = __gpu_thread_id(0);
  const uint32_t global_idx = tid + (__gpu_num_threads(0) * __gpu_block_id(0));

  shared_mem[tid] = tid;
  shared_mem[tid] *= 2;
  out[global_idx] = shared_mem[tid];
}
16 changes: 16 additions & 0 deletions offload/unittests/OffloadAPI/device_code/localmem_reduction.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#include <gpuintrin.h>
#include <stdint.h>

// Unsized group-local buffer indexed by thread id in the kernel below.
// NOTE(review): presumably sized by the host's dynamic shared memory request
// at launch -- confirm against the launching test.
extern __gpu_local uint32_t shared_mem[];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No test for static shared memory? That would require [[clang::loader_uninitialized]] to work.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added one. Since the number of tests is getting a bit large, I've also added a macro to the olLaunchKernel test.


// Every thread stores the value 2 in its shared_mem slot; after the barrier,
// thread 0 of each block sums all slots into out[block id].
__gpu_kernel void localmem_reduction(uint32_t *out) {
  const uint32_t tid = __gpu_thread_id(0);
  shared_mem[tid] = 2;

  // All slots must be written before thread 0 starts reading them.
  __gpu_sync_threads();

  // Only the first thread of the block performs the accumulation.
  if (tid != 0)
    return;

  const uint32_t block = __gpu_block_id(0);
  const uint32_t thread_count = __gpu_num_threads(0);

  out[block] = 0;
  for (uint32_t slot = 0; slot != thread_count; ++slot)
    out[block] += shared_mem[slot];
}
63 changes: 63 additions & 0 deletions offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,21 @@ struct olLaunchKernelNoArgsTest : LaunchKernelTestBase {
};
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelNoArgsTest);

// Fixture whose SetUp prepares the "localmem" device kernel via the base
// class, bailing out early if that setup reports a fatal failure.
struct olLaunchKernelLocalMemTest : LaunchKernelTestBase {
  void SetUp() override {
    const char *const KernelName = "localmem";
    RETURN_ON_FATAL_FAILURE(LaunchKernelTestBase::SetUpKernel(KernelName));
  }
};
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelLocalMemTest);

// Fixture whose SetUp prepares the "localmem_reduction" device kernel via the
// base class, bailing out early if that setup reports a fatal failure.
struct olLaunchKernelLocalMemReductionTest : LaunchKernelTestBase {
  void SetUp() override {
    const char *const KernelName = "localmem_reduction";
    RETURN_ON_FATAL_FAILURE(LaunchKernelTestBase::SetUpKernel(KernelName));
  }
};
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelLocalMemReductionTest);

TEST_P(olLaunchKernelTest, Success) {
void *Mem;
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
Expand Down Expand Up @@ -99,3 +114,51 @@ TEST_P(olLaunchKernelTest, SuccessSynchronous) {

ASSERT_SUCCESS(olMemFree(Mem));
}

// Launches the "localmem" kernel over 4 groups and verifies that each output
// element equals twice the in-group index of the thread that produced it.
TEST_P(olLaunchKernelLocalMemTest, Success) {
  LaunchArgs.NumGroups.x = 4;

  // The kernel indexes its shared buffer by thread id, so it needs one
  // uint32_t slot per thread in a group. Derive the size from the group size
  // instead of hard-coding it so the request cannot drift from the launch
  // configuration.
  const auto GroupSize = LaunchArgs.GroupSize.x;
  LaunchArgs.DynSharedMemory = GroupSize * sizeof(uint32_t);

  // One output element per launched thread.
  const auto NumThreads = GroupSize * LaunchArgs.NumGroups.x;

  void *Mem;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            NumThreads * sizeof(uint32_t), &Mem));
  // The kernel's only argument is the output pointer.
  struct {
    void *Mem;
  } Args{Mem};

  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
                                &LaunchArgs, nullptr));

  // Results are only read after the queue has drained.
  ASSERT_SUCCESS(olWaitQueue(Queue));

  uint32_t *Data = static_cast<uint32_t *>(Mem);
  // i % GroupSize recovers the in-group thread index from the global index.
  for (uint32_t i = 0; i < NumThreads; i++)
    ASSERT_EQ(Data[i], (i % GroupSize) * 2);

  ASSERT_SUCCESS(olMemFree(Mem));
}

// Launches the "localmem_reduction" kernel over 4 groups and verifies that
// each group's output is 2 * group size (every thread contributes the
// value 2).
TEST_P(olLaunchKernelLocalMemReductionTest, Success) {
  LaunchArgs.NumGroups.x = 4;

  // The kernel indexes its shared buffer by thread id, so it needs one
  // uint32_t slot per thread in a group. Derive the size from the group size
  // (which the expectation below already uses) instead of hard-coding it.
  LaunchArgs.DynSharedMemory = LaunchArgs.GroupSize.x * sizeof(uint32_t);

  // One output element per group.
  const auto NumGroups = LaunchArgs.NumGroups.x;

  void *Mem;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
                            NumGroups * sizeof(uint32_t), &Mem));
  // The kernel's only argument is the output pointer.
  struct {
    void *Mem;
  } Args{Mem};

  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
                                &LaunchArgs, nullptr));

  // Results are only read after the queue has drained.
  ASSERT_SUCCESS(olWaitQueue(Queue));

  uint32_t *Data = static_cast<uint32_t *>(Mem);
  for (uint32_t i = 0; i < NumGroups; i++)
    ASSERT_EQ(Data[i], 2 * LaunchArgs.GroupSize.x);

  ASSERT_SUCCESS(olMemFree(Mem));
}
Loading