-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[Offload] Add liboffload unit tests for shared/local memory #147040
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-offload Author: Ross Brunton (RossBrunton). Changes — full diff: https://github.com/llvm/llvm-project/pull/147040.diff (4 files affected):
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index 132c7a7c51fb8..acc57f3fa3473 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -2,6 +2,8 @@ add_offload_test_device_code(foo.c foo)
add_offload_test_device_code(bar.c bar)
# Compile with optimizations to eliminate AMDGPU implicit arguments.
add_offload_test_device_code(noargs.c noargs -O3)
+add_offload_test_device_code(localmem.c localmem)
+add_offload_test_device_code(localmem_reduction.c localmem_reduction)
-add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin)
+add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin localmem.bin localmem_reduction.bin)
set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
diff --git a/offload/unittests/OffloadAPI/device_code/localmem.c b/offload/unittests/OffloadAPI/device_code/localmem.c
new file mode 100644
index 0000000000000..d70847900bc43
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/localmem.c
@@ -0,0 +1,11 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+extern __gpu_local uint32_t shared_mem[];
+
+// Each thread stores its own thread index in its slot of the `shared_mem`
+// local-memory array, doubles that slot in place, and copies the result to
+// `out` at the thread's flattened position across all blocks (dimension 0).
+__gpu_kernel void localmem(uint32_t *out) {
+  const uint32_t tid = __gpu_thread_id(0);
+  // Flattened index of this thread: block offset plus index within the block.
+  const uint32_t global_idx = tid + (__gpu_num_threads(0) * __gpu_block_id(0));
+
+  shared_mem[tid] = tid;
+  shared_mem[tid] *= 2;
+  out[global_idx] = shared_mem[tid];
+}
diff --git a/offload/unittests/OffloadAPI/device_code/localmem_reduction.c b/offload/unittests/OffloadAPI/device_code/localmem_reduction.c
new file mode 100644
index 0000000000000..8a9a46cfb6a11
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/localmem_reduction.c
@@ -0,0 +1,16 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+extern __gpu_local uint32_t shared_mem[];
+
+// Per-block sum: every thread writes 2 into its `shared_mem` slot; after a
+// barrier, thread 0 adds up one slot per thread in the block and leaves the
+// total in out[block id].
+__gpu_kernel void localmem_reduction(uint32_t *out) {
+  shared_mem[__gpu_thread_id(0)] = 2;
+
+  // Every slot must be written before thread 0 starts reading them.
+  __gpu_sync_threads();
+
+  // Only the first thread of each block performs the accumulation.
+  if (__gpu_thread_id(0) != 0)
+    return;
+
+  uint32_t *block_sum = &out[__gpu_block_id(0)];
+  *block_sum = 0;
+  for (uint32_t slot = 0; slot < __gpu_num_threads(0); ++slot)
+    *block_sum += shared_mem[slot];
+}
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index a3da334afccac..639a790de8b4a 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -52,6 +52,21 @@ struct olLaunchKernelNoArgsTest : LaunchKernelTestBase {
};
OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelNoArgsTest);
+/// Fixture for the "localmem" kernel tests: SetUp() forwards the kernel name
+/// to LaunchKernelTestBase::SetUpKernel and returns early on a fatal failure.
+struct olLaunchKernelLocalMemTest : public LaunchKernelTestBase {
+  void SetUp() override { RETURN_ON_FATAL_FAILURE(SetUpKernel("localmem")); }
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelLocalMemTest);
+
+/// Fixture for the "localmem_reduction" kernel tests: SetUp() forwards the
+/// kernel name to LaunchKernelTestBase::SetUpKernel and returns early on a
+/// fatal failure.
+struct olLaunchKernelLocalMemReductionTest : public LaunchKernelTestBase {
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(SetUpKernel("localmem_reduction"));
+  }
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelLocalMemReductionTest);
+
TEST_P(olLaunchKernelTest, Success) {
void *Mem;
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
@@ -99,3 +114,51 @@ TEST_P(olLaunchKernelTest, SuccessSynchronous) {
ASSERT_SUCCESS(olMemFree(Mem));
}
+
+// Launches the "localmem" kernel over 4 groups and checks that every output
+// element equals twice the index of the producing thread within its group.
+TEST_P(olLaunchKernelLocalMemTest, Success) {
+  LaunchArgs.NumGroups.x = 4;
+  // The kernel indexes shared_mem by thread id, so it needs one uint32_t per
+  // thread in a group. Derive the size (and the expected values below) from
+  // the launch configuration instead of repeating the group size as a literal.
+  const auto GroupSize = LaunchArgs.GroupSize.x;
+  const auto NumThreads = GroupSize * LaunchArgs.NumGroups.x;
+  LaunchArgs.DynSharedMemory = GroupSize * sizeof(uint32_t);
+
+  void *Mem;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                            NumThreads * sizeof(uint32_t), &Mem));
+  // The kernel's single argument is the output buffer pointer.
+  struct {
+    void *Mem;
+  } Args{Mem};
+
+  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+                                &LaunchArgs, nullptr));
+
+  ASSERT_SUCCESS(olWaitQueue(Queue));
+
+  // Element i was written by thread (i % GroupSize) of group (i / GroupSize).
+  uint32_t *Data = static_cast<uint32_t *>(Mem);
+  for (uint32_t i = 0; i < NumThreads; i++)
+    ASSERT_EQ(Data[i], (i % GroupSize) * 2);
+
+  ASSERT_SUCCESS(olMemFree(Mem));
+}
+
+// Launches the "localmem_reduction" kernel over 4 groups and checks that each
+// group's output slot holds 2 * (threads per group).
+TEST_P(olLaunchKernelLocalMemReductionTest, Success) {
+  LaunchArgs.NumGroups.x = 4;
+  // The kernel uses one uint32_t of shared memory per thread in a group; size
+  // it from the launch configuration so it stays in step with the expectation
+  // below, which is also expressed in terms of GroupSize.x.
+  LaunchArgs.DynSharedMemory = LaunchArgs.GroupSize.x * sizeof(uint32_t);
+
+  void *Mem;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                            LaunchArgs.NumGroups.x * sizeof(uint32_t), &Mem));
+  // The kernel's single argument is the output buffer pointer.
+  struct {
+    void *Mem;
+  } Args{Mem};
+
+  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+                                &LaunchArgs, nullptr));
+
+  ASSERT_SUCCESS(olWaitQueue(Queue));
+
+  // Every thread contributes 2, so each group's total is 2 * GroupSize.x.
+  uint32_t *Data = static_cast<uint32_t *>(Mem);
+  for (uint32_t i = 0; i < LaunchArgs.NumGroups.x; i++)
+    ASSERT_EQ(Data[i], 2 * LaunchArgs.GroupSize.x);
+
+  ASSERT_SUCCESS(olMemFree(Mem));
+}
|
| #include <gpuintrin.h> | ||
| #include <stdint.h> | ||
|
|
||
| extern __gpu_local uint32_t shared_mem[]; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No test for static shared memory? That would require [[clang::loader_uninitialized]] to work.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've added one. Since the number of tests is getting a bit large, I've also added a macro to the olLaunchKernel test.
No description provided.