Skip to content

Conversation

@RossBrunton
Copy link
Contributor

No description provided.

@llvmbot llvmbot added the offload label Jul 4, 2025
@llvmbot
Copy link
Member

llvmbot commented Jul 4, 2025

@llvm/pr-subscribers-offload

Author: Ross Brunton (RossBrunton)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/147040.diff

4 Files Affected:

  • (modified) offload/unittests/OffloadAPI/device_code/CMakeLists.txt (+3-1)
  • (added) offload/unittests/OffloadAPI/device_code/localmem.c (+11)
  • (added) offload/unittests/OffloadAPI/device_code/localmem_reduction.c (+16)
  • (modified) offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp (+63)
diff --git a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
index 132c7a7c51fb8..acc57f3fa3473 100644
--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -2,6 +2,8 @@ add_offload_test_device_code(foo.c foo)
 add_offload_test_device_code(bar.c bar)
 # Compile with optimizations to eliminate AMDGPU implicit arguments.
 add_offload_test_device_code(noargs.c noargs -O3)
+add_offload_test_device_code(localmem.c localmem)
+add_offload_test_device_code(localmem_reduction.c localmem_reduction)
 
-add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin)
+add_custom_target(offload_device_binaries DEPENDS foo.bin bar.bin noargs.bin localmem.bin localmem_reduction.bin)
 set(OFFLOAD_TEST_DEVICE_CODE_PATH ${CMAKE_CURRENT_BINARY_DIR} PARENT_SCOPE)
diff --git a/offload/unittests/OffloadAPI/device_code/localmem.c b/offload/unittests/OffloadAPI/device_code/localmem.c
new file mode 100644
index 0000000000000..d70847900bc43
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/localmem.c
@@ -0,0 +1,11 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+extern __gpu_local uint32_t shared_mem[];
+
+__gpu_kernel void localmem(uint32_t *out) {
+  const uint32_t tid = __gpu_thread_id(0); // x index within this block
+  shared_mem[tid] = tid;
+  shared_mem[tid] *= 2; // doubled in place in the __gpu_local array
+  out[tid + (__gpu_num_threads(0) * __gpu_block_id(0))] = shared_mem[tid];
+}
diff --git a/offload/unittests/OffloadAPI/device_code/localmem_reduction.c b/offload/unittests/OffloadAPI/device_code/localmem_reduction.c
new file mode 100644
index 0000000000000..8a9a46cfb6a11
--- /dev/null
+++ b/offload/unittests/OffloadAPI/device_code/localmem_reduction.c
@@ -0,0 +1,16 @@
+#include <gpuintrin.h>
+#include <stdint.h>
+
+extern __gpu_local uint32_t shared_mem[];
+
+__gpu_kernel void localmem_reduction(uint32_t *out) {
+  // Every thread stores 2 in its own slot before the block-wide sync below.
+  shared_mem[__gpu_thread_id(0)] = 2;
+  __gpu_sync_threads();
+  if (__gpu_thread_id(0) != 0)
+    return;
+  uint32_t sum = 0; // only thread 0 totals the block's slots
+  for (uint32_t i = 0; i < __gpu_num_threads(0); i++)
+    sum += shared_mem[i];
+  out[__gpu_block_id(0)] = sum;
+}
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index a3da334afccac..639a790de8b4a 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -52,6 +52,21 @@ struct olLaunchKernelNoArgsTest : LaunchKernelTestBase {
 };
 OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelNoArgsTest);
 
+// Fixture that sets up the kernel named "localmem" (see localmem.c), which
+// round-trips each thread's id through the extern __gpu_local array.
+struct olLaunchKernelLocalMemTest : LaunchKernelTestBase {
+  void SetUp() override { RETURN_ON_FATAL_FAILURE(SetUpKernel("localmem")); }
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelLocalMemTest);
+
+// Fixture that sets up the kernel named "localmem_reduction" (per-block sum).
+struct olLaunchKernelLocalMemReductionTest : LaunchKernelTestBase {
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(SetUpKernel("localmem_reduction"));
+  }
+};
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelLocalMemReductionTest);
+
 TEST_P(olLaunchKernelTest, Success) {
   void *Mem;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
@@ -99,3 +114,51 @@ TEST_P(olLaunchKernelTest, SuccessSynchronous) {
 
   ASSERT_SUCCESS(olMemFree(Mem));
 }
+
+TEST_P(olLaunchKernelLocalMemTest, Success) {
+  // Each work-item uses one uint32_t slot, so size local memory per group.
+  LaunchArgs.NumGroups.x = 4;
+  LaunchArgs.DynSharedMemory = LaunchArgs.GroupSize.x * sizeof(uint32_t);
+  const auto NumItems = LaunchArgs.GroupSize.x * LaunchArgs.NumGroups.x;
+
+  void *Mem;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                            NumItems * sizeof(uint32_t), &Mem));
+  struct {
+    void *Mem;
+  } Args{Mem};
+
+  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+                                &LaunchArgs, nullptr));
+  ASSERT_SUCCESS(olWaitQueue(Queue));
+
+  // Work-item t of each group is expected to have written 2 * t.
+  uint32_t *Data = static_cast<uint32_t *>(Mem);
+  for (uint32_t i = 0; i < NumItems; i++)
+    ASSERT_EQ(Data[i], (i % LaunchArgs.GroupSize.x) * 2);
+
+  ASSERT_SUCCESS(olMemFree(Mem));
+}
+
+TEST_P(olLaunchKernelLocalMemReductionTest, Success) {
+  // The kernel sums one slot per work-item, so size local memory per group.
+  LaunchArgs.NumGroups.x = 4;
+  LaunchArgs.DynSharedMemory = LaunchArgs.GroupSize.x * sizeof(uint32_t);
+
+  void *Mem;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                            LaunchArgs.NumGroups.x * sizeof(uint32_t), &Mem));
+  struct {
+    void *Mem;
+  } Args{Mem};
+
+  ASSERT_SUCCESS(olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args),
+                                &LaunchArgs, nullptr));
+  ASSERT_SUCCESS(olWaitQueue(Queue));
+  // One result per group: every work-item contributed the value 2.
+  uint32_t *Data = static_cast<uint32_t *>(Mem);
+  for (uint32_t i = 0; i < LaunchArgs.NumGroups.x; i++)
+    ASSERT_EQ(Data[i], 2 * LaunchArgs.GroupSize.x);
+
+  ASSERT_SUCCESS(olMemFree(Mem));
+}

@RossBrunton RossBrunton requested review from callumfare and jhuber6 July 4, 2025 11:01
#include <gpuintrin.h>
#include <stdint.h>

extern __gpu_local uint32_t shared_mem[];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No test for static shared memory? That would require [[clang::loader_uninitialized]] to work.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added one. Since the number of tests is getting a bit large, I've also added a macro to the olLaunchKernel test.

@RossBrunton RossBrunton requested a review from jhuber6 July 7, 2025 10:31
@RossBrunton RossBrunton merged commit 8ae8d31 into llvm:main Jul 7, 2025
9 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants