
Conversation

@RossBrunton
Contributor

@RossBrunton RossBrunton commented Jun 5, 2025

This is equivalent to cuOccupancyMaxPotentialBlockSize. It is currently
only implemented on Cuda; AMDGPU and Host return unsupported.

Co-Authored-By: Callum Fare [email protected]

@llvmbot
Member

llvmbot commented Jun 5, 2025

@llvm/pr-subscribers-offload

@llvm/pr-subscribers-backend-amdgpu

Author: Ross Brunton (RossBrunton)

Changes

This is equivalent to cuOccupancyMaxPotentialBlockSize. It is currently
only implemented on Cuda; AMDGPU and Host return the legal-but-suboptimal
value of 1.

Co-Authored-By: Callum Fare <[email protected]>


Full diff: https://github.com/llvm/llvm-project/pull/142950.diff

10 Files Affected:

  • (modified) offload/liboffload/API/Kernel.td (+13)
  • (modified) offload/liboffload/src/OffloadImpl.cpp (+19-1)
  • (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+8)
  • (modified) offload/plugins-nextgen/common/include/PluginInterface.h (+3)
  • (modified) offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp (+1)
  • (modified) offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h (+3)
  • (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+14)
  • (modified) offload/plugins-nextgen/host/src/rtl.cpp (+7)
  • (modified) offload/unittests/OffloadAPI/CMakeLists.txt (+1)
  • (added) offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp (+37)
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 247f9c1bf5b6a..71869cf4a68bb 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -24,6 +24,19 @@ def : Function {
     let returns = [];
 }
 
+def : Function {
+    let name = "olKernelMaxGroupSize";
+    let desc = "Get the maximum block size needed to achieve maximum occupancy.";
+    let details = [];
+    let params = [
+        Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+        Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
+        Param<"size_t", "SharedMemory", "dynamic shared memory required", PARAM_IN>,
+        Param<"size_t*", "GroupSize", "maximum block size", PARAM_OUT>
+    ];
+    let returns = [];
+}
+
 def : Struct {
     let name = "ol_kernel_launch_size_args_t";
     let desc = "Size-related arguments for a kernel launch.";
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 7b67cbba43e68..a3f8d4ba52d1e 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -468,6 +468,10 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
   return olDestroy(Program);
 }
 
+inline GenericKernelTy *getOmpKernel(ol_kernel_handle_t OlKernel) {
+  return reinterpret_cast<GenericKernelTy *>(OlKernel);
+}
+
 Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
                        ol_kernel_handle_t *Kernel) {
 
@@ -484,6 +488,20 @@ Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
   return Error::success();
 }
 
+Error olKernelMaxGroupSize_impl(ol_kernel_handle_t Kernel,
+                                ol_device_handle_t Device,
+                                size_t DynamicMemSize, size_t *GroupSize) {
+  auto *KernelImpl = getOmpKernel(Kernel);
+
+  auto Res = KernelImpl->maxGroupSize(*Device->Device, DynamicMemSize);
+  if (auto Err = Res.takeError()) {
+    return Err;
+  }
+  *GroupSize = *Res;
+
+  return Error::success();
+}
+
 Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
                           ol_kernel_handle_t Kernel, const void *ArgumentsData,
                           size_t ArgumentsSize,
@@ -514,7 +532,7 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
   // Don't do anything with pointer indirection; use arg data as-is
   LaunchArgs.Flags.IsCUDA = true;
 
-  auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
+  auto *KernelImpl = getOmpKernel(Kernel);
   auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr,
                                 LaunchArgs, AsyncInfoWrapper);
 
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e4c32713e2c15..bed9764bddf55 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -570,6 +570,14 @@ struct AMDGPUKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Return maximum block size for maximum occupancy
+  ///
+  /// TODO: This needs to be implemented for amdgpu
+  Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                size_t DynamicMemSize) const override {
+    return 1;
+  }
+
   /// Print more elaborate kernel launch info for AMDGPU
   Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index d2437908a0a6f..5d4e9fa212f52 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -276,6 +276,9 @@ struct GenericKernelTy {
                            KernelLaunchParamsTy LaunchParams,
                            AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
 
+  virtual Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                        size_t DynamicMemSize) const = 0;
+
   /// Get the kernel name.
   const char *getName() const { return Name; }
 
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index e5332686fcffb..e6699ee78596d 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -71,6 +71,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
 DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
 DLWRAP(cuDevicePrimaryCtxRetain, 2)
 DLWRAP(cuModuleLoadDataEx, 5)
+DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
 
 DLWRAP(cuDeviceCanAccessPeer, 3)
 DLWRAP(cuCtxEnablePeerAccess, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 1c5b421768894..2c856c68a9368 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -289,6 +289,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
 static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
 
 typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+typedef size_t (*CUoccupancyB2DSize)(int);
 
 CUresult cuCtxGetDevice(CUdevice *);
 CUresult cuDeviceGet(CUdevice *, int);
@@ -370,5 +371,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
 CUresult cuMemGetAllocationGranularity(size_t *granularity,
                                        const CUmemAllocationProp *prop,
                                        CUmemAllocationGranularity_flags option);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+                                          CUoccupancyB2DSize, size_t, int);
 
 #endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 44ccfc47a21c9..45d9647da9e53 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Return maximum block size for maximum occupancy
+  Expected<size_t> maxGroupSize(GenericDeviceTy &,
+                                size_t DynamicMemSize) const override {
+    int minGridSize;
+    int maxBlockSize;
+    auto Res = cuOccupancyMaxPotentialBlockSize(
+        &minGridSize, &maxBlockSize, Func, NULL, DynamicMemSize, INT_MAX);
+    if (auto Err = Plugin::check(
+            Res, "error in cuOccupancyMaxPotentialBlockSize: %s")) {
+      return Err;
+    }
+    return maxBlockSize;
+  }
+
 private:
   /// The CUDA kernel function to execute.
   CUfunction Func;
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 9916f4d0ab250..a96aa346d33e5 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -114,6 +114,13 @@ struct GenELF64KernelTy : public GenericKernelTy {
     return Plugin::success();
   }
 
+  /// Return maximum block size for maximum occupancy
+  Expected<size_t> maxGroupSize(GenericDeviceTy &Device,
+                                size_t DynamicMemSize) const override {
+    // TODO
+    return 1;
+  }
+
 private:
   /// The kernel function to execute.
   void (*Func)(void);
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 2844b675e5de1..ac302d502c30c 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -14,6 +14,7 @@ add_offload_unittest("event"
 
 add_offload_unittest("kernel"
     kernel/olGetKernel.cpp
+    kernel/olKernelMaxGroupSize.cpp
     kernel/olLaunchKernel.cpp)
 
 add_offload_unittest("memory"
diff --git a/offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp b/offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp
new file mode 100644
index 0000000000000..e83775ae0d896
--- /dev/null
+++ b/offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp
@@ -0,0 +1,37 @@
+//===------- Offload API tests - olKernelMaxGroupSize ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olKernelMaxGroupSizeTest = OffloadKernelTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olKernelMaxGroupSizeTest);
+
+TEST_P(olKernelMaxGroupSizeTest, Success) {
+  size_t Size{0};
+  ASSERT_SUCCESS(olKernelMaxGroupSize(Kernel, Device, 0, &Size));
+  ASSERT_GT(Size, 0);
+}
+
+TEST_P(olKernelMaxGroupSizeTest, NullKernel) {
+  size_t Size;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olKernelMaxGroupSize(nullptr, Device, 0, &Size));
+}
+
+TEST_P(olKernelMaxGroupSizeTest, NullDevice) {
+  size_t Size;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olKernelMaxGroupSize(Kernel, nullptr, 0, &Size));
+}
+
+TEST_P(olKernelMaxGroupSizeTest, NullOutput) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olKernelMaxGroupSize(Kernel, Device, 0, nullptr));
+}

Comment on lines 573 to 581
Contributor

Calculating occupancy is difficult on AMD GPUs; I'd need to dig up the code where HIP does it to see how it's done.

Contributor Author

I had a look and found https://github.com/ROCm/hip/blob/854768787ee9bbd6ed22b3e8fd0f139955a57e6a/src/hip_module.cpp#L1015 (which might be a bit out of date). I took one look at it and decided it was probably worth doing as a separate change.

@RossBrunton
Contributor Author

Just to summarize and get discussion in one place: As input we have a kernel name/handle, device and local memory size. From that we want to output the maximum group size.

In terms of API design, I can see multiple options:

Option 1 (keep the handle an opaque void * and pass device and memory size explicitly):

typedef void *ol_kernel_handle_t;
[...]
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", &Kernel);
olKernelMaxGroupSize(Kernel, Device, 1024, &GroupSize);

Option 2 (back the handle with a struct that remembers its program):

struct ol_kernel_handle_impl {
  GenericKernelTy *PluginKernel;
  ol_program_handle_t Program;
};
typedef ol_kernel_handle_impl *ol_kernel_handle_t;
[...]
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", &Kernel);
olKernelMaxGroupSize(Kernel, 1024, &GroupSize);

Option 3 (store the dynamic memory size in the handle at creation time and query via a generic info function):

struct ol_kernel_handle_impl {
  GenericKernelTy *PluginKernel;
  ol_program_handle_t Program;
  size_t DynMemorySize;
};
typedef ol_kernel_handle_impl *ol_kernel_handle_t;
[...]
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", 1024, &Kernel);
olGetKernelInfo(Kernel, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize);

Option 4 (create the kernel from an invocation struct carrying the dynamic memory size):

struct ol_kernel_handle_impl {
  GenericKernelTy *PluginKernel;
  ol_program_handle_t Program;
  size_t DynMemorySize;
};
typedef ol_kernel_handle_impl *ol_kernel_handle_t;
struct ol_kernel_invocation {
  ol_program_handle_t Program;
  const char *Name;
  size_t DynMemorySize;
};
[...]
ol_kernel_invocation Invocation{Program, "foo", 1024};
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(&Invocation, &Kernel);
olGetKernelInfo(Kernel, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize);

Option 5 (pass a small invocation struct at query time instead of storing it):

struct ol_kernel_handle_impl {
  GenericKernelTy *PluginKernel;
  ol_program_handle_t Program;
};
struct ol_kernel_invocation {
  size_t DynMemorySize;
};
[...]
ol_kernel_invocation Invocation{1024};
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", &Kernel);
olGetKernelInfo(Kernel, &Invocation, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize);

Option 6 (make the invocation struct itself the kernel object):

struct ol_kernel_invocation {
  ol_program_handle_t Program;
  const char *Name;
  size_t DynMemorySize;
  // Should not be used by API users
  GenericKernelTy *PluginKernel;
};
[...]
ol_kernel_invocation Invocation{Program, "foo", 1024, nullptr};
size_t GroupSize;
olKernelLoad(&Invocation);
olGetKernelInfo(Invocation, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize);

Which of these options looks best to you, @callumfare @jhuber6 ?

@callumfare
Contributor

I don't mind the kernel being an opaque handle like the other types as opposed to void* like it is now. If the underlying pointer is useful to end-users we can expose it with something like olGetKernelNativeHandle. Long-term we'll need functions like that for every handle type anyway to support SYCL's native interop functionality.

I don't think we should have any state kept in the kernel (same reason we don't set arguments like UR/OpenCL) so I'm not a fan of any option where we store DynMemorySize in the kernel. Likewise adding a ol_kernel_invocation type seems overkill.

Option 2 seems best if we change ol_kernel_handle_t, otherwise option 1 seems fine.
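
For illustration, a minimal sketch of such a native-handle accessor, assuming the option-2 handle struct above; olGetKernelNativeHandle and the error constants here are hypothetical, not part of this patch:

// Hypothetical native-interop accessor for an opaque kernel handle.
// Assumes the option-2 ol_kernel_handle_impl layout shown above.
ol_errc_t olGetKernelNativeHandle(ol_kernel_handle_t Kernel,
                                  void **NativeHandle) {
  if (!Kernel || !NativeHandle)
    return OL_ERRC_INVALID_NULL_HANDLE;
  // Hand back the plugin-side kernel object for use with native APIs.
  *NativeHandle = Kernel->PluginKernel;
  return OL_ERRC_SUCCESS; // schematic success value
}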

@RossBrunton RossBrunton changed the title from "[Offload] Add olKernelMaxGroupSize" to "[Offload] Add olGetKernelMaxGroupSize" Jun 10, 2025
@RossBrunton
Contributor Author

After a small discussion internally at Codeplay, I've renamed the function from olKernelMaxGroupSize to olGetKernelMaxGroupSize.

@RossBrunton
Contributor Author

@jhuber6 When you get a chance can you look at the options I posted above and let me know your thoughts?

@jhuber6
Contributor

jhuber6 commented Jun 16, 2025

So, we should probably just do something similar to HSA and make a generic symbol type then have a single function to get information out of it.

@RossBrunton
Contributor Author

@jhuber6 So something like option 2, but with ol_symbol_handle_t instead of ol_kernel_handle_t?

I think having an olGetKernelMaxGroupSize (distinct from olGetSymbolInfo) method is pretty much unavoidable if we don't want to have the symbol type store DynamicMemorySize.

@RossBrunton
Contributor Author

@jhuber6 I'm still trying to wrap my head around what you are looking for here. olGetKernelInfo or even having ol_kernel_handle_t be a non-opaque type makes sense for other queries, but the max group size depends on the local memory size (which as far as I can tell only the caller knows about).

We could have a dedicated olGetKernelLaunchInfo (or similar) that accepts a kernel, device and local memory size and populates a provided struct with the work group size, but I don't know what information other than the max WG size such a struct could contain.
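
To make that shape concrete, a hedged sketch; olGetKernelLaunchInfo and ol_kernel_launch_info_t are hypothetical names from this discussion, not real API:

// Hypothetical launch-info query populating a caller-provided struct.
struct ol_kernel_launch_info_t {
  size_t MaxGroupSize; // max work group size for the given dynamic memory
  // Unclear what else would belong here, which is the concern raised above.
};

ol_errc_t olGetKernelLaunchInfo(ol_kernel_handle_t Kernel,
                                ol_device_handle_t Device,
                                size_t DynamicMemSize,
                                ol_kernel_launch_info_t *Info);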

@jhuber6
Contributor

jhuber6 commented Jul 1, 2025

@jhuber6 I'm still trying to wrap my head around what you are looking for here. olGetKernelInfo or even having ol_kernel_handle_t be a non-opaque type makes sense for other queries, but the max group size depends on the local memory size (which as far as I can tell only the caller knows about).

We could have a dedicated olGetKernelLaunchInfo (or similar) that accepts a kernel, device and local memory size and populates a provided struct with the work group size, but I don't know what information other than the max WG size such a struct could contain.

Mostly I'm just wary of having such a specific function when I feel like this should be something generic queried from a kernel. We should probably just go with what others like HSA do and make the kernel type a handle with an info function that can be used to get stuff like this, most importantly the address.

@RossBrunton
Contributor Author

@jhuber6 I'm not a fan of that approach; it means that every time a kernel handle is created, a bunch of functors have to be created, even if the implementation never uses them. It also makes plugin code much more complicated.

If reducing the number of API functions is the goal, how about the following:

uint64_t LocalSize = getLocalSizeFromSomewhere();
uint64_t MaxGroupSize;
olQueryKernel(Device, Kernel, OL_KERNEL_QUERY_MAX_GROUP_SIZE, sizeof(MaxGroupSize), &MaxGroupSize, &LocalSize);

We add a function similar to olGetKernelInfo, but have it accept an additional const void *. This pointer points to an "argument" for the query type; in this case a const uint64_t * pointing to the local size. If we need more queries in the future, we can add new OL_KERNEL_QUERY_*s, each with a specific argument type.
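
A rough sketch of how that dispatch could look on the liboffload side; ol_kernel_query_t, OL_KERNEL_QUERY_MAX_GROUP_SIZE, the error constants, and the queryMaxGroupSize helper are all hypothetical:

// Hypothetical query entry point with a per-query typed argument.
ol_errc_t olQueryKernel(ol_device_handle_t Device, ol_kernel_handle_t Kernel,
                        ol_kernel_query_t Query, size_t ValueSize, void *Value,
                        const void *Arg) {
  switch (Query) {
  case OL_KERNEL_QUERY_MAX_GROUP_SIZE: {
    // For this query, Arg is a const uint64_t * holding the local size.
    if (!Arg || ValueSize != sizeof(uint64_t))
      return OL_ERRC_INVALID_ARGUMENT; // hypothetical error code
    uint64_t LocalSize = *static_cast<const uint64_t *>(Arg);
    // Forward to the plugin's occupancy query and store the result.
    *static_cast<uint64_t *>(Value) = queryMaxGroupSize(Device, Kernel, LocalSize);
    return OL_ERRC_SUCCESS;
  }
  }
  return OL_ERRC_UNSUPPORTED; // hypothetical: unknown query value
}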

@jhuber6
Contributor

jhuber6 commented Jul 1, 2025

We already have an internal kernel type, don't we? Likely we just need something similar to the set / get attributes for kernels that CUDA / HSA provide. Though I suppose that this is just another level because most would probably be at the device level? Like I said, HSA already does this so there's some precedent.

@RossBrunton
Contributor Author

@jhuber6 RossBrunton@a8d1be8 Just so we're talking about the same thing, I've made a commit containing what I think you're suggesting.

I'm still not a fan of this approach - it's very different to how we've been handling other entry points and can't take advantage of our tablegen validation/loader layer code generation.

Of course, if other LLVM people prefer this approach, I'm happy to be outvoted.

@RossBrunton
Contributor Author

@jhuber6 For some context, I'm looking to implement urKernelGetSuggestedLocalWorkSize ( https://oneapi-src.github.io/unified-runtime/core/api.html#urkernelgetsuggestedlocalworksize ) which does a lot of magic but for the Cuda backend boils down to a call to cuOccupancyMaxPotentialBlockSize. This function basically takes in a kernel and shared memory size (i.e. dynamic memory used), and spits out the maximum number of work items that can fit on the device. Thus an equivalent offload API would also need to take in a kernel and memory size, meaning it can't use the normal (hypothetical) olGetKernelInfo interface.

Unless we want to rethink how we store kernels, I think a dedicated olGetKernelMaxGroupSize function is the best option.
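
For reference, the driver-level call UR ultimately makes is small. This is a direct use of the real CUDA driver API, assuming Func is an already-loaded CUfunction:

// #include <cuda.h> -- CUDA driver API
int MinGridSize = 0, MaxBlockSize = 0;
size_t DynamicSharedMem = 1024; // dynamic shared memory per block, in bytes
CUresult Res = cuOccupancyMaxPotentialBlockSize(
    &MinGridSize, &MaxBlockSize, Func,
    /*blockSizeToDynamicSMemSize=*/NULL, DynamicSharedMem,
    /*blockSizeLimit=*/0); // 0 means no limit; the patch passes INT_MAX
// On success, MaxBlockSize is the largest block size that still achieves
// maximum occupancy for this kernel on the current device.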

@RossBrunton
Contributor Author

@jhuber6 @callumfare This has been re-written with the new API changes, mind having a look?

Contributor

@jhuber6 jhuber6 left a comment

I know we've probably covered this before, but I need to reorient myself here. What was the issue with something like this again?

olGetSymbolInfo(symbol_t symbol, attribute attr, void *data);

uint32_t size;
symbol_t sym = olGetSymbol("foo");
olGetSymbolInfo(sym, OL_SYMBOL_INFO_MAX_GROUP_SIZE, &size);

@RossBrunton
Contributor Author

@jhuber6 The max work group size is dependent on the amount of dynamic memory the kernel will launch with, so we need to have that passed in as well.

@jhuber6
Contributor

jhuber6 commented Jul 25, 2025

@jhuber6 The max work group size is dependent on the amount of dynamic memory the kernel will launch with, so we need to have that passed in as well.

Does it? OpenCL has https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clGetDeviceInfo.html which supports CL_DEVICE_MAX_WORK_GROUP_SIZE. The description states

Maximum number of work-items in a work-group that a device is capable of executing on a single compute unit, for any given kernel-instance running on the device. (Refer also to clEnqueueNDRangeKernel and CL_KERNEL_WORK_GROUP_SIZE ). The minimum value is 1. The returned value is an upper limit and will not necessarily maximize performance. This maximum may be larger than supported by a specific kernel (refer to the CL_KERNEL_WORK_GROUP_SIZE query of clGetKernelWorkGroupInfo).

So, why can't we do the same?

@RossBrunton
Contributor Author

@jhuber6 I think a more appropriate comparison would be the cuOccupancyMaxPotentialBlockSize CUDA function or clGetKernelSuggestedLocalWorkSizeKHR. As I understand it, CL_DEVICE_MAX_WORK_GROUP_SIZE is the maximum work group size for any kernel, and specific kernels that require a lot of resources may have a smaller limit.

I'm looking to implement urKernelGetSuggestedLocalWorkSize on top of liboffload. The cuda UR backend uses cuOccupancyMaxPotentialBlockSize, so I wanted to implement a way to expose that function through liboffload.
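
For contrast, the per-kernel OpenCL query mentioned above (a real API; Kernel and Device are assumed to be already-created cl_kernel and cl_device_id objects):

// #include <CL/cl.h>
size_t KernelMax = 0;
clGetKernelWorkGroupInfo(Kernel, Device, CL_KERNEL_WORK_GROUP_SIZE,
                         sizeof(KernelMax), &KernelMax, NULL);
// KernelMax can be smaller than CL_DEVICE_MAX_WORK_GROUP_SIZE when the
// kernel uses many registers or much local memory, and neither query
// accounts for the dynamic local memory chosen at launch time.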

@jhuber6
Contributor

jhuber6 commented Jul 25, 2025

I guess my main concern is that this function is pretty specific; the function CUDA provides is based on their own heuristic. I think it's correct to pass more threads with whatever dynamic shared memory usage, but does it change performance? I'm honestly wondering if this wouldn't be better solved by a custom option to the kernel launch instead.

@RossBrunton
Contributor Author

I want to expose the number directly to UR (I guess so users can try out different values for local memory and see which works best). I don't think having a launch option for "determine the best option" would fix this (although I don't entirely disagree with the existence of such a function).

It's currently specific to Nvidia, but an implementation can be made for AMD (as it is in HIP) and users will probably appreciate having it available in the liboffload library rather than having to roll it themselves.

@RossBrunton
Contributor Author

@jhuber6 This has been sitting here for a while now, and I'd like to either merge it or close it. I think, practically speaking, it has to be its own function, different from the getKernelInfo function, because of the dynamic size requirement.

I think this makes sense to add to the liboffload API. There's an equivalent API in CUDA which seems to be commonly used, and providing it in liboffload (and for AMD when it gets implemented) will make porting code easier.

Contributor

@jhuber6 jhuber6 left a comment

I suppose it's useful in the name of portability. Is it the maximum group size or just the optimal one?

RossBrunton and others added 3 commits August 19, 2025 11:34
This is equivalent to `cuOccupancyMaxPotentialBlockSize`. It is currently
only implemented on Cuda; AMDGPU and Host return the legal-but-suboptimal
value of `1`.

Co-Authored-By: Callum Fare <[email protected]>
@RossBrunton RossBrunton changed the title from "[Offload] Add olGetKernelMaxGroupSize" to "[Offload] Add olCalculateMaxOccupancy" Aug 19, 2025
@github-actions

github-actions bot commented Aug 19, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@RossBrunton
Contributor Author

@jhuber6 I've taken another look and done another revamp (sorry I'm being so messy about this). Hopefully the new name and description clear it up, but as far as I can tell the optimal size is the max size.

@RossBrunton RossBrunton changed the title from "[Offload] Add olCalculateMaxOccupancy" to "[Offload] Add olCalculateOptimalOccupancy" Aug 19, 2025
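
A hedged usage sketch under the final name, assuming the parameter list of the olKernelMaxGroupSize draft above carried over (the merged header may differ):

size_t GroupSize = 0;
// On CUDA this maps to cuOccupancyMaxPotentialBlockSize.
olCalculateOptimalOccupancy(Kernel, Device, /*SharedMemory=*/0, &GroupSize);
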
@RossBrunton RossBrunton merged commit 2c11a83 into llvm:main Aug 19, 2025
9 checks passed