
Conversation

@RossBrunton
Contributor

@RossBrunton RossBrunton commented Jun 5, 2025

This is equivalent to cuOccupancyMaxPotentialBlockSize. It is currently
only implemented on Cuda; AMDGPU and Host return unsupported.

Co-Authored-By: Callum Fare [email protected]

@llvmbot
Member

llvmbot commented Jun 5, 2025

@llvm/pr-subscribers-offload

@llvm/pr-subscribers-backend-amdgpu

Author: Ross Brunton (RossBrunton)

Changes

This is equivalent to cuOccupancyMaxPotentialBlockSize. It is currently
only implemented on Cuda; AMDGPU and Host return the legal-but-suboptimal
value of 1.

Co-Authored-By: Callum Fare <[email protected]>


Full diff: https://github.com/llvm/llvm-project/pull/142950.diff

10 Files Affected:

  • (modified) offload/liboffload/API/Kernel.td (+13)
  • (modified) offload/liboffload/src/OffloadImpl.cpp (+19-1)
  • (modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (+8)
  • (modified) offload/plugins-nextgen/common/include/PluginInterface.h (+3)
  • (modified) offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp (+1)
  • (modified) offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h (+3)
  • (modified) offload/plugins-nextgen/cuda/src/rtl.cpp (+14)
  • (modified) offload/plugins-nextgen/host/src/rtl.cpp (+7)
  • (modified) offload/unittests/OffloadAPI/CMakeLists.txt (+1)
  • (added) offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp (+37)
diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 247f9c1bf5b6a..71869cf4a68bb 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -24,6 +24,19 @@ def : Function {
     let returns = [];
 }
 
+def : Function {
+    let name = "olKernelMaxGroupSize";
+    let desc = "Get the maximum block size needed to achieve maximum occupancy.";
+    let details = [];
+    let params = [
+        Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+        Param<"ol_device_handle_t", "Device", "device intended to run the kernel", PARAM_IN>,
+        Param<"size_t", "SharedMemory", "dynamic shared memory required", PARAM_IN>,
+        Param<"size_t*", "GroupSize", "maximum block size", PARAM_OUT>
+    ];
+    let returns = [];
+}
+
 def : Struct {
     let name = "ol_kernel_launch_size_args_t";
     let desc = "Size-related arguments for a kernel launch.";
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 7b67cbba43e68..a3f8d4ba52d1e 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -468,6 +468,10 @@ Error olDestroyProgram_impl(ol_program_handle_t Program) {
   return olDestroy(Program);
 }
 
+inline GenericKernelTy *getOmpKernel(ol_kernel_handle_t OlKernel) {
+  return reinterpret_cast<GenericKernelTy *>(OlKernel);
+}
+
 Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
                        ol_kernel_handle_t *Kernel) {
 
@@ -484,6 +488,20 @@ Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
   return Error::success();
 }
 
+Error olKernelMaxGroupSize_impl(ol_kernel_handle_t Kernel,
+                                ol_device_handle_t Device,
+                                size_t DynamicMemSize, size_t *GroupSize) {
+  auto *KernelImpl = getOmpKernel(Kernel);
+
+  auto Res = KernelImpl->maxGroupSize(*Device->Device, DynamicMemSize);
+  if (auto Err = Res.takeError()) {
+    return Err;
+  }
+  *GroupSize = *Res;
+
+  return Error::success();
+}
+
 Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
                           ol_kernel_handle_t Kernel, const void *ArgumentsData,
                           size_t ArgumentsSize,
@@ -514,7 +532,7 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
   // Don't do anything with pointer indirection; use arg data as-is
   LaunchArgs.Flags.IsCUDA = true;
 
-  auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
+  auto *KernelImpl = getOmpKernel(Kernel);
   auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr,
                                 LaunchArgs, AsyncInfoWrapper);
 
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e4c32713e2c15..bed9764bddf55 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -570,6 +570,14 @@ struct AMDGPUKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Return maximum block size for maximum occupancy
+  ///
+  /// TODO: This needs to be implemented for amdgpu
+  Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                size_t DynamicMemSize) const override {
+    return 1;
+  }
+
   /// Print more elaborate kernel launch info for AMDGPU
   Error printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
                                KernelArgsTy &KernelArgs, uint32_t NumThreads[3],
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index d2437908a0a6f..5d4e9fa212f52 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -276,6 +276,9 @@ struct GenericKernelTy {
                            KernelLaunchParamsTy LaunchParams,
                            AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
 
+  virtual Expected<size_t> maxGroupSize(GenericDeviceTy &GenericDevice,
+                                        size_t DynamicMemSize) const = 0;
+
   /// Get the kernel name.
   const char *getName() const { return Name; }
 
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
index e5332686fcffb..e6699ee78596d 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -71,6 +71,7 @@ DLWRAP(cuDevicePrimaryCtxGetState, 3)
 DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
 DLWRAP(cuDevicePrimaryCtxRetain, 2)
 DLWRAP(cuModuleLoadDataEx, 5)
+DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
 
 DLWRAP(cuDeviceCanAccessPeer, 3)
 DLWRAP(cuCtxEnablePeerAccess, 2)
diff --git a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
index 1c5b421768894..2c856c68a9368 100644
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -289,6 +289,7 @@ static inline void *CU_LAUNCH_PARAM_BUFFER_POINTER = (void *)0x01;
 static inline void *CU_LAUNCH_PARAM_BUFFER_SIZE = (void *)0x02;
 
 typedef void (*CUstreamCallback)(CUstream, CUresult, void *);
+typedef size_t (*CUoccupancyB2DSize)(int);
 
 CUresult cuCtxGetDevice(CUdevice *);
 CUresult cuDeviceGet(CUdevice *, int);
@@ -370,5 +371,7 @@ CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size,
 CUresult cuMemGetAllocationGranularity(size_t *granularity,
                                        const CUmemAllocationProp *prop,
                                        CUmemAllocationGranularity_flags option);
+CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
+                                          CUoccupancyB2DSize, size_t, int);
 
 #endif
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index 44ccfc47a21c9..45d9647da9e53 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -157,6 +157,20 @@ struct CUDAKernelTy : public GenericKernelTy {
                    KernelLaunchParamsTy LaunchParams,
                    AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
 
+  /// Return maximum block size for maximum occupancy
+  Expected<size_t> maxGroupSize(GenericDeviceTy &,
+                                size_t DynamicMemSize) const override {
+    int minGridSize;
+    int maxBlockSize;
+    auto Res = cuOccupancyMaxPotentialBlockSize(
+        &minGridSize, &maxBlockSize, Func, NULL, DynamicMemSize, INT_MAX);
+    if (auto Err = Plugin::check(
+            Res, "error in cuOccupancyMaxPotentialBlockSize: %s")) {
+      return Err;
+    }
+    return maxBlockSize;
+  }
+
 private:
   /// The CUDA kernel function to execute.
   CUfunction Func;
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index 9916f4d0ab250..a96aa346d33e5 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -114,6 +114,13 @@ struct GenELF64KernelTy : public GenericKernelTy {
     return Plugin::success();
   }
 
+  /// Return maximum block size for maximum occupancy
+  Expected<size_t> maxGroupSize(GenericDeviceTy &Device,
+                                size_t DynamicMemSize) const override {
+    // TODO
+    return 1;
+  }
+
 private:
   /// The kernel function to execute.
   void (*Func)(void);
diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 2844b675e5de1..ac302d502c30c 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -14,6 +14,7 @@ add_offload_unittest("event"
 
 add_offload_unittest("kernel"
     kernel/olGetKernel.cpp
+    kernel/olKernelMaxGroupSize.cpp
     kernel/olLaunchKernel.cpp)
 
 add_offload_unittest("memory"
diff --git a/offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp b/offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp
new file mode 100644
index 0000000000000..e83775ae0d896
--- /dev/null
+++ b/offload/unittests/OffloadAPI/kernel/olKernelMaxGroupSize.cpp
@@ -0,0 +1,37 @@
+//===------- Offload API tests - olKernelMaxGroupSize ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+using olKernelMaxGroupSizeTest = OffloadKernelTest;
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olKernelMaxGroupSizeTest);
+
+TEST_P(olKernelMaxGroupSizeTest, Success) {
+  size_t Size{0};
+  ASSERT_SUCCESS(olKernelMaxGroupSize(Kernel, Device, 0, &Size));
+  ASSERT_GT(Size, 0);
+}
+
+TEST_P(olKernelMaxGroupSizeTest, NullKernel) {
+  size_t Size;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olKernelMaxGroupSize(nullptr, Device, 0, &Size));
+}
+
+TEST_P(olKernelMaxGroupSizeTest, NullDevice) {
+  size_t Size;
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_HANDLE,
+               olKernelMaxGroupSize(Kernel, nullptr, 0, &Size));
+}
+
+TEST_P(olKernelMaxGroupSizeTest, NullOutput) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER,
+               olKernelMaxGroupSize(Kernel, Device, 0, nullptr));
+}

Comment on lines 573 to 581
Contributor

Calculating occupancy is difficult on AMD GPUs; I'd need to dig up the code where HIP does it to see how it's done.

Contributor Author

I had a look and found https://github.com/ROCm/hip/blob/854768787ee9bbd6ed22b3e8fd0f139955a57e6a/src/hip_module.cpp#L1015 (which might be a bit out of date). I took one look at it and decided it was probably worth doing as a separate change.

@RossBrunton
Contributor Author

Just to summarize and get discussion in one place: As input we have a kernel name/handle, device and local memory size. From that we want to output the maximum group size.

In terms of API design, I can see multiple options:

Option 1 (keep the handle an opaque void * and pass device and memory size explicitly):

typedef void *ol_kernel_handle_t;
[...]
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", &Kernel);
olKernelMaxGroupSize(Kernel, Device, 1024, &GroupSize);

Option 2 (back the handle with a struct that remembers its program):

struct ol_kernel_handle_impl {
  GenericKernelTy *PluginKernel;
  ol_program_handle_t Program;
};
typedef ol_kernel_handle_impl *ol_kernel_handle_t;
[...]
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", &Kernel);
olKernelMaxGroupSize(Kernel, 1024, &GroupSize);

Option 3 (store the dynamic memory size in the handle at creation time and query via a generic info function):

struct ol_kernel_handle_impl {
  GenericKernelTy *PluginKernel;
  ol_program_handle_t Program;
  size_t DynMemorySize;
};
typedef ol_kernel_handle_impl *ol_kernel_handle_t;
[...]
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", 1024, &Kernel);
olGetKernelInfo(Kernel, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize);

Option 4 (create the kernel from an invocation struct carrying the dynamic memory size):

struct ol_kernel_handle_impl {
  GenericKernelTy *PluginKernel;
  ol_program_handle_t Program;
  size_t DynMemorySize;
};
typedef ol_kernel_handle_impl *ol_kernel_handle_t;
struct ol_kernel_invocation {
  ol_program_handle_t Program;
  const char *Name;
  size_t DynMemorySize;
};
[...]
ol_kernel_invocation Invocation{Program, "foo", 1024};
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(&Invocation, &Kernel);
olGetKernelInfo(Kernel, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize);

Option 5 (pass a small invocation struct at query time instead of storing it):

struct ol_kernel_handle_impl {
  GenericKernelTy *PluginKernel;
  ol_program_handle_t Program;
};
struct ol_kernel_invocation {
  size_t DynMemorySize;
};
[...]
ol_kernel_invocation Invocation{1024};
size_t GroupSize;
ol_kernel_handle_t Kernel;
olGetKernel(Program, "foo", &Kernel);
olGetKernelInfo(Kernel, &Invocation, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize);

Option 6 (make the invocation struct itself the kernel object):

struct ol_kernel_invocation {
  ol_program_handle_t Program;
  const char *Name;
  size_t DynMemorySize;
  // Should not be used by API users
  GenericKernelTy *PluginKernel;
};
[...]
ol_kernel_invocation Invocation{Program, "foo", 1024, nullptr};
size_t GroupSize;
olKernelLoad(&Invocation);
olGetKernelInfo(Invocation, OL_KERNEL_INFO_MAX_GROUP_SIZE, sizeof(GroupSize), &GroupSize);

Which of these options looks best to you, @callumfare @jhuber6 ?

@callumfare
Contributor

I don't mind the kernel being an opaque handle like the other types as opposed to void* like it is now. If the underlying pointer is useful to end-users we can expose it with something like olGetKernelNativeHandle. Long-term we'll need functions like that for every handle type anyway to support SYCL's native interop functionality.

I don't think we should have any state kept in the kernel (same reason we don't set arguments like UR/OpenCL) so I'm not a fan of any option where we store DynMemorySize in the kernel. Likewise adding a ol_kernel_invocation type seems overkill.

Option 2 seems best if we change ol_kernel_handle_t, otherwise option 1 seems fine.
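
For illustration, a minimal sketch of such a native-handle accessor, assuming the option-2 handle struct above; olGetKernelNativeHandle and the error constants here are hypothetical, not part of this patch:

// Hypothetical native-interop accessor for an opaque kernel handle.
// Assumes the option-2 ol_kernel_handle_impl layout shown above.
ol_errc_t olGetKernelNativeHandle(ol_kernel_handle_t Kernel,
                                  void **NativeHandle) {
  if (!Kernel || !NativeHandle)
    return OL_ERRC_INVALID_NULL_HANDLE;
  // Hand back the plugin-side kernel object for use with native APIs.
  *NativeHandle = Kernel->PluginKernel;
  return OL_ERRC_SUCCESS; // schematic success value
}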

@RossBrunton RossBrunton changed the title from "[Offload] Add olKernelMaxGroupSize" to "[Offload] Add olGetKernelMaxGroupSize" Jun 10, 2025
@RossBrunton
Contributor Author

After a small discussion internally at Codeplay, I've renamed the function from olKernelMaxGroupSize to olGetKernelMaxGroupSize.

@RossBrunton
Contributor Author

@jhuber6 When you get a chance can you look at the options I posted above and let me know your thoughts?

@jhuber6
Contributor

jhuber6 commented Jun 16, 2025

So, we should probably just do something similar to HSA and make a generic symbol type then have a single function to get information out of it.

@RossBrunton
Contributor Author

@jhuber6 So something like option 2, but with ol_symbol_handle_t instead of ol_kernel_handle_t?

I think having an olGetKernelMaxGroupSize (distinct from olGetSymbolInfo) method is pretty much unavoidable if we don't want to have the symbol type store DynamicMemorySize.

@RossBrunton
Contributor Author

@jhuber6 I'm still trying to wrap my head around what you are looking for here. olGetKernelInfo or even having ol_kernel_handle_t be a non-opaque type makes sense for other queries, but the max group size depends on the local memory size (which as far as I can tell only the caller knows about).

We could have a dedicated olGetKernelLaunchInfo (or similar) that accepts a kernel, device and local memory size and populates a provided struct with the work group size, but I don't know what information other than the max WG size such a struct could contain.
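
To make that shape concrete, a hedged sketch; olGetKernelLaunchInfo and ol_kernel_launch_info_t are hypothetical names from this discussion, not real API:

// Hypothetical launch-info query populating a caller-provided struct.
struct ol_kernel_launch_info_t {
  size_t MaxGroupSize; // max work group size for the given dynamic memory
  // Unclear what else would belong here, which is the concern raised above.
};

ol_errc_t olGetKernelLaunchInfo(ol_kernel_handle_t Kernel,
                                ol_device_handle_t Device,
                                size_t DynamicMemSize,
                                ol_kernel_launch_info_t *Info);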

@jhuber6
Contributor

jhuber6 commented Jul 1, 2025

@jhuber6 I'm still trying to wrap my head around what you are looking for here. olGetKernelInfo or even having ol_kernel_handle_t be a non-opaque type makes sense for other queries, but the max group size depends on the local memory size (which as far as I can tell only the caller knows about).

We could have a dedicated olGetKernelLaunchInfo (or similar) that accepts a kernel, device and local memory size and populates a provided struct with the work group size, but I don't know what information other than the max WG size such a struct could contain.

Mostly I'm just wary of having such a specific function when I feel like this should be something generic queried from a kernel. We should probably just go with what others like HSA do and make the kernel type a handle with an info function that can be used to get stuff like this, most importantly the address.

@RossBrunton
Contributor Author

@jhuber6 I'm not a fan of that approach; it means that every time a kernel handle is created, a bunch of functors have to be created, even if the implementation never uses them. It also makes plugin code much more complicated.

If reducing the number of API functions is the goal, how about the following:

uint64_t LocalSize = getLocalSizeFromSomewhere();
uint64_t MaxGroupSize;
olQueryKernel(Device, Kernel, OL_KERNEL_QUERY_MAX_GROUP_SIZE, sizeof(MaxGroupSize), &MaxGroupSize, &LocalSize);

We add a function similar to olGetKernelInfo, but have it accept an additional const void *. This pointer points to an "argument" for the query type; in this case a const uint64_t * pointing to the local size. If we need more queries in the future, we can add new OL_KERNEL_QUERY_*s, each with a specific argument type.
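
A rough sketch of how that dispatch could look on the liboffload side; ol_kernel_query_t, OL_KERNEL_QUERY_MAX_GROUP_SIZE, the error constants, and the queryMaxGroupSize helper are all hypothetical:

// Hypothetical query entry point with a per-query typed argument.
ol_errc_t olQueryKernel(ol_device_handle_t Device, ol_kernel_handle_t Kernel,
                        ol_kernel_query_t Query, size_t ValueSize, void *Value,
                        const void *Arg) {
  switch (Query) {
  case OL_KERNEL_QUERY_MAX_GROUP_SIZE: {
    // For this query, Arg is a const uint64_t * holding the local size.
    if (!Arg || ValueSize != sizeof(uint64_t))
      return OL_ERRC_INVALID_ARGUMENT; // hypothetical error code
    uint64_t LocalSize = *static_cast<const uint64_t *>(Arg);
    // Forward to the plugin's occupancy query and store the result.
    *static_cast<uint64_t *>(Value) = queryMaxGroupSize(Device, Kernel, LocalSize);
    return OL_ERRC_SUCCESS;
  }
  }
  return OL_ERRC_UNSUPPORTED; // hypothetical: unknown query value
}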

@jhuber6
Contributor

jhuber6 commented Jul 1, 2025

We already have an internal kernel type, don't we? Likely we just need something similar to the set / get attributes for kernels that CUDA / HSA provide. Though I suppose that this is just another level because most would probably be at the device level? Like I said, HSA already does this so there's some precedent.

@RossBrunton
Contributor Author

@jhuber6 RossBrunton@a8d1be8 Just so we're talking about the same thing, I've made a commit containing what I think you're suggesting.

I'm still not a fan of this approach - it's very different to how we've been handling other entry points and can't take advantage of our tablegen validation/loader layer code generation.

Of course, if other LLVM people prefer this approach, I'm happy to be outvoted.

@RossBrunton
Contributor Author

@jhuber6 For some context, I'm looking to implement urKernelGetSuggestedLocalWorkSize ( https://oneapi-src.github.io/unified-runtime/core/api.html#urkernelgetsuggestedlocalworksize ) which does a lot of magic but for the Cuda backend boils down to a call to cuOccupancyMaxPotentialBlockSize. This function basically takes in a kernel and shared memory size (i.e. dynamic memory used), and spits out the maximum number of work items that can fit on the device. Thus an equivalent offload API would also need to take in a kernel and memory size, meaning it can't use the normal (hypothetical) olGetKernelInfo interface.

Unless we want to rethink how we store kernels, I think a dedicated olGetKernelMaxGroupSize function is the best option.
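
For reference, the driver-level call UR ultimately makes is small. This is a direct use of the real CUDA driver API, assuming Func is an already-loaded CUfunction:

// #include <cuda.h> -- CUDA driver API
int MinGridSize = 0, MaxBlockSize = 0;
size_t DynamicSharedMem = 1024; // dynamic shared memory per block, in bytes
CUresult Res = cuOccupancyMaxPotentialBlockSize(
    &MinGridSize, &MaxBlockSize, Func,
    /*blockSizeToDynamicSMemSize=*/NULL, DynamicSharedMem,
    /*blockSizeLimit=*/0); // 0 means no limit; the patch passes INT_MAX
// On success, MaxBlockSize is the largest block size that still achieves
// maximum occupancy for this kernel on the current device.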

@RossBrunton
Contributor Author

@jhuber6 @callumfare This has been re-written with the new API changes, mind having a look?

Contributor

@jhuber6 jhuber6 left a comment

I know we've probably covered this before, but I need to reorient myself here. What was the issue with something like this again?

olGetSymbolInfo(symbol_t symbol, attribute attr, void *data);

uint32_t size;
symbol_t sym = olGetSymbol("foo");
olGetSymbolInfo(sym, OL_SYMBOL_INFO_MAX_GROUP_SIZE, &size);

@RossBrunton
Contributor Author

@jhuber6 The max work group size is dependent on the amount of dynamic memory the kernel will launch with, so we need to have that passed in as well.

@jhuber6
Contributor

jhuber6 commented Jul 25, 2025

@jhuber6 The max work group size is dependent on the amount of dynamic memory the kernel will launch with, so we need to have that passed in as well.

Does it? OpenCL has https://registry.khronos.org/OpenCL/sdk/3.0/docs/man/html/clGetDeviceInfo.html which supports CL_DEVICE_MAX_WORK_GROUP_SIZE. The description states

Maximum number of work-items in a work-group that a device is capable of executing on a single compute unit, for any given kernel-instance running on the device. (Refer also to clEnqueueNDRangeKernel and CL_KERNEL_WORK_GROUP_SIZE ). The minimum value is 1. The returned value is an upper limit and will not necessarily maximize performance. This maximum may be larger than supported by a specific kernel (refer to the CL_KERNEL_WORK_GROUP_SIZE query of clGetKernelWorkGroupInfo).

So, why can't we do the same?

@RossBrunton
Contributor Author

@jhuber6 I think a more appropriate comparison would be the cuOccupancyMaxPotentialBlockSize CUDA function or clGetKernelSuggestedLocalWorkSizeKHR. As I understand it, CL_DEVICE_MAX_WORK_GROUP_SIZE is the maximum work group size for any kernel, and specific kernels that require a lot of resources may have a smaller limit.

I'm looking to implement urKernelGetSuggestedLocalWorkSize on top of liboffload. The cuda UR backend uses cuOccupancyMaxPotentialBlockSize, so I wanted to implement a way to expose that function through liboffload.
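
For contrast, the per-kernel OpenCL query mentioned above (a real API; Kernel and Device are assumed to be already-created cl_kernel and cl_device_id objects):

// #include <CL/cl.h>
size_t KernelMax = 0;
clGetKernelWorkGroupInfo(Kernel, Device, CL_KERNEL_WORK_GROUP_SIZE,
                         sizeof(KernelMax), &KernelMax, NULL);
// KernelMax can be smaller than CL_DEVICE_MAX_WORK_GROUP_SIZE when the
// kernel uses many registers or much local memory, and neither query
// accounts for the dynamic local memory chosen at launch time.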

@jhuber6
Contributor

jhuber6 commented Jul 25, 2025

I guess my main concern is that this function is pretty specific; the function CUDA provides is based on their own heuristic. I think it's correct to pass more threads with whatever dynamic shared memory usage, but does it change performance? I'm honestly wondering if this wouldn't be better solved by a custom option to the kernel launch instead.

@RossBrunton
Contributor Author

I want to expose the number directly to UR (I guess so users can try out different values for local memory and see which works best). I don't think having a launch option for "determine the best option" would fix this (although I don't entirely disagree with the existence of such a function).

It's currently specific to Nvidia, but an implementation can be made for AMD (as it is in HIP) and users will probably appreciate having it available in the liboffload library rather than having to roll it themselves.

@RossBrunton
Contributor Author

@jhuber6 This has been sitting here for a while now, and I'd like to either merge it or close it. I think, practically speaking, it has to be its own function, different from the getKernelInfo function, because of the dynamic size requirement.

I think this makes sense to add to the liboffload API. There's an equivalent API in CUDA which seems to be commonly used, and providing it in liboffload (and for AMD when it gets implemented) will make porting code easier.

Contributor

@jhuber6 jhuber6 left a comment

I suppose it's useful in the name of portability. Is it the maximum group size or just the optimal one?

RossBrunton and others added 3 commits August 19, 2025 11:34
This is equivalent to `cuOccupancyMaxPotentialBlockSize`. It is currently
only implemented on Cuda; AMDGPU and Host return the legal-but-suboptimal
value of `1`.

Co-Authored-By: Callum Fare <[email protected]>
@RossBrunton RossBrunton changed the title from "[Offload] Add olGetKernelMaxGroupSize" to "[Offload] Add olCalculateMaxOccupancy" Aug 19, 2025
@github-actions

github-actions bot commented Aug 19, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@RossBrunton
Contributor Author

@jhuber6 I've taken another look and done another revamp (sorry I'm being so messy about this). Hopefully the new name and description clear it up, but as far as I can tell the optimal size is the max size.

@RossBrunton RossBrunton changed the title from "[Offload] Add olCalculateMaxOccupancy" to "[Offload] Add olCalculateOptimalOccupancy" Aug 19, 2025
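
A hedged usage sketch under the final name, assuming the parameter list of the olKernelMaxGroupSize draft above carried over (the merged header may differ):

size_t GroupSize = 0;
// On CUDA this maps to cuOccupancyMaxPotentialBlockSize.
olCalculateOptimalOccupancy(Kernel, Device, /*SharedMemory=*/0, &GroupSize);
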
@RossBrunton RossBrunton merged commit 2c11a83 into llvm:main Aug 19, 2025
9 checks passed