From 883393cd04c69dd8dcde5f9354ca94f90d9cdc48 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Fri, 30 May 2025 12:57:54 +0100
Subject: [PATCH 1/2] [Offload] Add olLaunchKernelSuggestedGroupSize

This adds a new entry point, `olLaunchKernelSuggestedGroupSize`, which
launches a kernel without the caller specifying a work group size.
Implementations will use internal, device-specific heuristics to
determine an ideal work group size.

Eventually, anyway: for this change the work group size is simply
hardcoded as `{1, 1, 1}`.
---
 offload/liboffload/API/Kernel.td              | 35 ++++++++
 .../liboffload/include/generated/OffloadAPI.h | 71 +++++++++++++++++
 .../include/generated/OffloadEntryPoints.inc  | 79 +++++++++++++++++++
 .../include/generated/OffloadFuncs.inc        |  2 +
 .../generated/OffloadImplFuncDecls.inc        |  6 ++
 .../include/generated/OffloadPrint.hpp        | 51 ++++++++++++
 offload/liboffload/src/OffloadImpl.cpp        | 63 +++++++++++----
 7 files changed, 293 insertions(+), 14 deletions(-)

diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td
index 247f9c1bf5b6a..2ff75aa2ba002 100644
--- a/offload/liboffload/API/Kernel.td
+++ b/offload/liboffload/API/Kernel.td
@@ -59,3 +59,38 @@ def : Function {
         Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
     ];
 }
+
+// NOTE(review): olLaunchKernelSuggestedGroupSize_impl never reads Dimensions.
+def : Struct {
+    let name = "ol_kernel_launch_size_suggested_args_t";
+    let desc = "Size-related arguments for a kernel launch.";
+    let members = [
+        StructMember<"size_t", "Dimensions", "Number of work dimensions">,
+        StructMember<"size_t", "NumItemsX", "Number of work items on the X dimension">,
+        StructMember<"size_t", "NumItemsY", "Number of work items on the Y dimension">,
+        StructMember<"size_t", "NumItemsZ", "Number of work items on the Z dimension">,
+        StructMember<"size_t", "DynSharedMemory", "Size of dynamic shared memory in bytes.">
+    ];
+}
+
+def : Function {
+    let name = "olLaunchKernelSuggestedGroupSize";
+    let desc = "Enqueue a kernel launch with the specified work items and parameters.";
+    let details = [
+        "Behaves the same as olLaunchKernel, but the implementation automatically determines optimal work group sizes"
+    ];
+    let params = [
+        Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
+        Param<"ol_device_handle_t", "Device", "handle of the device to execute on", PARAM_IN>,
+        Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
+        Param<"const void*", "ArgumentsData", "pointer to the kernel argument struct", PARAM_IN_OPTIONAL>,
+        Param<"size_t", "ArgumentsSize", "size of the kernel argument struct", PARAM_IN>,
+        Param<"const ol_kernel_launch_size_suggested_args_t*", "LaunchSizeArgs", "pointer to the struct containing launch size parameters", PARAM_IN>,
+        Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
+    ];
+    let returns = [
+        Return<"OL_ERRC_INVALID_ARGUMENT", ["`Queue == NULL && EventOut != NULL`"]>,
+        Return<"OL_ERRC_INVALID_ARGUMENT", ["`ArgumentsSize > 0 && ArgumentsData == NULL`"]>,
+        Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
+    ];
+}
diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h
index a1d7540519e32..1752340615a82 100644
--- a/offload/liboffload/include/generated/OffloadAPI.h
+++ b/offload/liboffload/include/generated/OffloadAPI.h
@@ -723,6 +723,54 @@ OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernel(
     // [out][optional] optional recorded event for the enqueued operation
     ol_event_handle_t *EventOut);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Size-related arguments for a kernel launch.
+typedef struct ol_kernel_launch_size_suggested_args_t {
+  size_t Dimensions;      /// Number of work dimensions
+  size_t NumItemsX;       /// Number of work items on the X dimension
+  size_t NumItemsY;       /// Number of work items on the Y dimension
+  size_t NumItemsZ;       /// Number of work items on the Z dimension
+  size_t DynSharedMemory; /// Size of dynamic shared memory in bytes.
+} ol_kernel_launch_size_suggested_args_t;
+
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Enqueue a kernel launch with the specified work items and parameters.
+///
+/// @details
+///    - Behaves the same as olLaunchKernel, but the implementation
+///    automatically determines optimal work group sizes
+///
+/// @returns
+///     - ::OL_RESULT_SUCCESS
+///     - ::OL_ERRC_UNINITIALIZED
+///     - ::OL_ERRC_DEVICE_LOST
+///     - ::OL_ERRC_INVALID_ARGUMENT
+///         + `Queue == NULL && EventOut != NULL`
+///     - ::OL_ERRC_INVALID_ARGUMENT
+///         + `ArgumentsSize > 0 && ArgumentsData == NULL`
+///     - ::OL_ERRC_INVALID_DEVICE
+///         + If Queue is non-null but does not belong to Device
+///     - ::OL_ERRC_INVALID_NULL_HANDLE
+///         + `NULL == Device`
+///         + `NULL == Kernel`
+///     - ::OL_ERRC_INVALID_NULL_POINTER
+///         + `NULL == LaunchSizeArgs`
+OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSize(
+    // [in][optional] handle of the queue
+    ol_queue_handle_t Queue,
+    // [in] handle of the device to execute on
+    ol_device_handle_t Device,
+    // [in] handle of the kernel
+    ol_kernel_handle_t Kernel,
+    // [in][optional] pointer to the kernel argument struct
+    const void *ArgumentsData,
+    // [in] size of the kernel argument struct
+    size_t ArgumentsSize,
+    // [in] pointer to the struct containing launch size parameters
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    // [out][optional] optional recorded event for the enqueued operation
+    ol_event_handle_t *EventOut);
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Function parameters for olGetPlatformInfo
 /// @details Each entry is a pointer to the parameter passed to the function;
@@ -874,6 +922,19 @@ typedef struct ol_launch_kernel_params_t {
   ol_event_handle_t **pEventOut;
 } ol_launch_kernel_params_t;
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Function parameters for olLaunchKernelSuggestedGroupSize
+/// @details Each entry is a pointer to the parameter passed to the function;
+typedef struct ol_launch_kernel_suggested_group_size_params_t {
+  ol_queue_handle_t *pQueue;
+  ol_device_handle_t *pDevice;
+  ol_kernel_handle_t *pKernel;
+  const void **pArgumentsData;
+  size_t *pArgumentsSize;
+  const ol_kernel_launch_size_suggested_args_t **pLaunchSizeArgs;
+  ol_event_handle_t **pEventOut;
+} ol_launch_kernel_suggested_group_size_params_t;
+
 ///////////////////////////////////////////////////////////////////////////////
 /// @brief Variant of olInit that also sets source code location information
 /// @details See also ::olInit
@@ -1016,6 +1077,16 @@ OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelWithCodeLoc(
     const ol_kernel_launch_size_args_t *LaunchSizeArgs,
     ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
 
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Variant of olLaunchKernelSuggestedGroupSize that also sets source
+/// code location information
+/// @details See also ::olLaunchKernelSuggestedGroupSize
+OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSizeWithCodeLoc(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
+
 #if defined(__cplusplus)
 } // extern "C"
 #endif
diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc
index 9feebeea09ec3..3b7c8be609c92 100644
--- a/offload/liboffload/include/generated/OffloadEntryPoints.inc
+++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc
@@ -901,3 +901,82 @@ ol_result_t olLaunchKernelWithCodeLoc(
   currentCodeLocation() = nullptr;
   return Result;
 }
+
+///////////////////////////////////////////////////////////////////////////////
+llvm::Error olLaunchKernelSuggestedGroupSize_val(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut) {
+  if (offloadConfig().ValidationEnabled) {
+    if (Queue == NULL && EventOut != NULL) {
+      return createOffloadError(
+          error::ErrorCode::INVALID_ARGUMENT,
+          "validation failure: Queue == NULL && EventOut != NULL");
+    }
+
+    if (ArgumentsSize > 0 && ArgumentsData == NULL) {
+      return createOffloadError(
+          error::ErrorCode::INVALID_ARGUMENT,
+          "validation failure: ArgumentsSize > 0 && ArgumentsData == NULL");
+    }
+
+    if (NULL == Device) {
+      return createOffloadError(error::ErrorCode::INVALID_NULL_HANDLE,
+                                "validation failure: NULL == Device");
+    }
+
+    if (NULL == Kernel) {
+      return createOffloadError(error::ErrorCode::INVALID_NULL_HANDLE,
+                                "validation failure: NULL == Kernel");
+    }
+
+    if (NULL == LaunchSizeArgs) {
+      return createOffloadError(error::ErrorCode::INVALID_NULL_POINTER,
+                                "validation failure: NULL == LaunchSizeArgs");
+    }
+  }
+
+  return llvm::offload::olLaunchKernelSuggestedGroupSize_impl(
+      Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
+      EventOut);
+}
+OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSize(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut) {
+  if (offloadConfig().TracingEnabled) {
+    llvm::errs() << "---> olLaunchKernelSuggestedGroupSize";
+  }
+
+  ol_result_t Result =
+      llvmErrorToOffloadError(olLaunchKernelSuggestedGroupSize_val(
+          Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
+          EventOut));
+
+  if (offloadConfig().TracingEnabled) {
+    ol_launch_kernel_suggested_group_size_params_t Params = {
+        &Queue,         &Device,         &Kernel,  &ArgumentsData,
+        &ArgumentsSize, &LaunchSizeArgs, &EventOut};
+    llvm::errs() << "(" << &Params << ")";
+    llvm::errs() << "-> " << Result << "\n";
+    if (Result && Result->Details) {
+      llvm::errs() << "     *Error Details* " << Result->Details << " \n";
+    }
+  }
+  return Result;
+}
+ol_result_t olLaunchKernelSuggestedGroupSizeWithCodeLoc(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation) {
+  currentCodeLocation() = CodeLocation;
+  ol_result_t Result = ::olLaunchKernelSuggestedGroupSize(
+      Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
+      EventOut);
+
+  currentCodeLocation() = nullptr;
+  return Result;
+}
diff --git a/offload/liboffload/include/generated/OffloadFuncs.inc b/offload/liboffload/include/generated/OffloadFuncs.inc
index 78ff9ddb82799..48a1c73dad631 100644
--- a/offload/liboffload/include/generated/OffloadFuncs.inc
+++ b/offload/liboffload/include/generated/OffloadFuncs.inc
@@ -29,6 +29,7 @@ OFFLOAD_FUNC(olCreateProgram)
 OFFLOAD_FUNC(olDestroyProgram)
 OFFLOAD_FUNC(olGetKernel)
 OFFLOAD_FUNC(olLaunchKernel)
+OFFLOAD_FUNC(olLaunchKernelSuggestedGroupSize)
 OFFLOAD_FUNC(olInitWithCodeLoc)
 OFFLOAD_FUNC(olShutDownWithCodeLoc)
 OFFLOAD_FUNC(olGetPlatformInfoWithCodeLoc)
@@ -48,5 +49,6 @@ OFFLOAD_FUNC(olCreateProgramWithCodeLoc)
 OFFLOAD_FUNC(olDestroyProgramWithCodeLoc)
 OFFLOAD_FUNC(olGetKernelWithCodeLoc)
 OFFLOAD_FUNC(olLaunchKernelWithCodeLoc)
+OFFLOAD_FUNC(olLaunchKernelSuggestedGroupSizeWithCodeLoc)
 
 #undef OFFLOAD_FUNC
diff --git a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
index 71d25dee87867..d8c94e59182bc 100644
--- a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
+++ b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc
@@ -58,3 +58,9 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
                           size_t ArgumentsSize,
                           const ol_kernel_launch_size_args_t *LaunchSizeArgs,
                           ol_event_handle_t *EventOut);
+
+Error olLaunchKernelSuggestedGroupSize_impl(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut);
diff --git a/offload/liboffload/include/generated/OffloadPrint.hpp b/offload/liboffload/include/generated/OffloadPrint.hpp
index 3aad6223d4dea..706f45987e662 100644
--- a/offload/liboffload/include/generated/OffloadPrint.hpp
+++ b/offload/liboffload/include/generated/OffloadPrint.hpp
@@ -392,6 +392,31 @@ operator<<(llvm::raw_ostream &os,
   os << "}";
   return os;
 }
+///////////////////////////////////////////////////////////////////////////////
+/// @brief Print operator for the ol_kernel_launch_size_suggested_args_t type
+/// @returns llvm::raw_ostream &
+
+inline llvm::raw_ostream &
+operator<<(llvm::raw_ostream &os,
+           const struct ol_kernel_launch_size_suggested_args_t params) {
+  os << "(struct ol_kernel_launch_size_suggested_args_t){";
+  os << ".Dimensions = ";
+  os << params.Dimensions;
+  os << ", ";
+  os << ".NumItemsX = ";
+  os << params.NumItemsX;
+  os << ", ";
+  os << ".NumItemsY = ";
+  os << params.NumItemsY;
+  os << ", ";
+  os << ".NumItemsZ = ";
+  os << params.NumItemsZ;
+  os << ", ";
+  os << ".DynSharedMemory = ";
+  os << params.DynSharedMemory;
+  os << "}";
+  return os;
+}
 
 inline llvm::raw_ostream &
 operator<<(llvm::raw_ostream &os,
@@ -619,6 +644,32 @@ operator<<(llvm::raw_ostream &os,
   return os;
 }
 
+inline llvm::raw_ostream &operator<<(
+    llvm::raw_ostream &os,
+    const struct ol_launch_kernel_suggested_group_size_params_t *params) {
+  os << ".Queue = ";
+  printPtr(os, *params->pQueue);
+  os << ", ";
+  os << ".Device = ";
+  printPtr(os, *params->pDevice);
+  os << ", ";
+  os << ".Kernel = ";
+  printPtr(os, *params->pKernel);
+  os << ", ";
+  os << ".ArgumentsData = ";
+  printPtr(os, *params->pArgumentsData);
+  os << ", ";
+  os << ".ArgumentsSize = ";
+  os << *params->pArgumentsSize;
+  os << ", ";
+  os << ".LaunchSizeArgs = ";
+  printPtr(os, *params->pLaunchSizeArgs);
+  os << ", ";
+  os << ".EventOut = ";
+  printPtr(os, *params->pEventOut);
+  return os;
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // @brief Print pointer value
 template <typename T>
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 7b67cbba43e68..8a57afa8522c5 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -484,11 +484,10 @@ Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
   return Error::success();
 }
 
-Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
-                          ol_kernel_handle_t Kernel, const void *ArgumentsData,
-                          size_t ArgumentsSize,
-                          const ol_kernel_launch_size_args_t *LaunchSizeArgs,
-                          ol_event_handle_t *EventOut) {
+namespace {
+Error do_launch(ol_queue_handle_t Queue, ol_device_handle_t Device,
+                ol_kernel_handle_t Kernel, KernelArgsTy &Args,
+                ol_event_handle_t *EventOut) {
   auto *DeviceImpl = Device->Device;
   if (Queue && Device != Queue->Device) {
     return createOffloadError(
@@ -498,6 +497,26 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
 
   auto *QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
   AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl);
+  auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
+  auto Err = KernelImpl->launch(*DeviceImpl, Args.ArgPtrs, nullptr, Args,
+                                AsyncInfoWrapper);
+
+  AsyncInfoWrapper.finalize(Err);
+  if (Err)
+    return Err;
+
+  if (EventOut)
+    *EventOut = makeEvent(Queue);
+
+  return Error::success();
+}
+} // namespace
+
+Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
+                          ol_kernel_handle_t Kernel, const void *ArgumentsData,
+                          size_t ArgumentsSize,
+                          const ol_kernel_launch_size_args_t *LaunchSizeArgs,
+                          ol_event_handle_t *EventOut) {
   KernelArgsTy LaunchArgs{};
   LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroupsX;
   LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroupsY;
@@ -514,18 +533,34 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
   // Don't do anything with pointer indirection; use arg data as-is
   LaunchArgs.Flags.IsCUDA = true;
 
-  auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
-  auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr,
-                                LaunchArgs, AsyncInfoWrapper);
+  return do_launch(Queue, Device, Kernel, LaunchArgs, EventOut);
+}
 
-  AsyncInfoWrapper.finalize(Err);
-  if (Err)
-    return Err;
+Error olLaunchKernelSuggestedGroupSize_impl(
+    ol_queue_handle_t Queue, ol_device_handle_t Device,
+    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
+    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
+    ol_event_handle_t *EventOut) {
+  // TODO: Query the backend for a real size; {1, 1, 1} divides evenly for now
+  size_t PreferredSize[3] = {1, 1, 1};
 
-  if (EventOut)
-    *EventOut = makeEvent(Queue);
+  KernelArgsTy LaunchArgs{};
+  LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumItemsX / PreferredSize[0];
+  LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumItemsY / PreferredSize[1];
+  LaunchArgs.NumTeams[2] = LaunchSizeArgs->NumItemsZ / PreferredSize[2];
+  LaunchArgs.ThreadLimit[0] = PreferredSize[0];
+  LaunchArgs.ThreadLimit[1] = PreferredSize[1];
+  LaunchArgs.ThreadLimit[2] = PreferredSize[2];
+  LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;
 
-  return Error::success();
+  KernelLaunchParamsTy Params;
+  Params.Data = const_cast<void *>(ArgumentsData);
+  Params.Size = ArgumentsSize;
+  LaunchArgs.ArgPtrs = reinterpret_cast<void **>(&Params);
+  // Don't do anything with pointer indirection; use arg data as-is
+  LaunchArgs.Flags.IsCUDA = true;
+
+  return do_launch(Queue, Device, Kernel, LaunchArgs, EventOut);
 }
 
 } // namespace offload

From ba29f6964352ddb8eae8a0a9f3fbda1be8afe8c5 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Mon, 2 Jun 2025 15:46:28 +0100
Subject: [PATCH 2/2] Add test

---
 offload/unittests/OffloadAPI/CMakeLists.txt   |   1 +
 .../unittests/OffloadAPI/device_code/foo.c    |   8 +-
 .../olLaunchKernelSuggestedGroupSize.cpp      | 114 ++++++++++++++++++
 3 files changed, 122 insertions(+), 1 deletion(-)
 create mode 100644 offload/unittests/OffloadAPI/kernel/olLaunchKernelSuggestedGroupSize.cpp

diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt
index 54b5c4e245e62..9a8a53aaa98a8 100644
--- a/offload/unittests/OffloadAPI/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/CMakeLists.txt
@@ -20,6 +20,7 @@ add_offload_unittest("offload.unittests"
     ${CMAKE_CURRENT_SOURCE_DIR}/program/olDestroyProgram.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/kernel/olGetKernel.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/kernel/olLaunchKernel.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/kernel/olLaunchKernelSuggestedGroupSize.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/event/olDestroyEvent.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/event/olWaitEvent.cpp
     )
diff --git a/offload/unittests/OffloadAPI/device_code/foo.c b/offload/unittests/OffloadAPI/device_code/foo.c
index 83cdc53cddd8d..ffe1a71a0719b 100644
--- a/offload/unittests/OffloadAPI/device_code/foo.c
+++ b/offload/unittests/OffloadAPI/device_code/foo.c
@@ -2,5 +2,13 @@
 #include <stdint.h>
 
 __gpu_kernel void foo(uint32_t *out) {
-  out[__gpu_thread_id(0)] = __gpu_thread_id(0);
+  // Linearise this thread's global (x, y, z) position, with x varying fastest,
+  // and store that index in its own slot so the host can check out[i] == i.
+  uint32_t x = __gpu_block_id(0) * __gpu_num_threads(0) + __gpu_thread_id(0);
+  uint32_t xw = __gpu_num_blocks(0) * __gpu_num_threads(0);
+  uint32_t y = __gpu_block_id(1) * __gpu_num_threads(1) + __gpu_thread_id(1);
+  uint32_t yw = __gpu_num_blocks(1) * __gpu_num_threads(1);
+  uint32_t z = __gpu_block_id(2) * __gpu_num_threads(2) + __gpu_thread_id(2);
+  uint32_t offset = (z * yw * xw) + (y * xw) + x;
+  out[offset] = offset;
 }
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernelSuggestedGroupSize.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernelSuggestedGroupSize.cpp
new file mode 100644
index 0000000000000..5c13351b1066d
--- /dev/null
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernelSuggestedGroupSize.cpp
@@ -0,0 +1,100 @@
+//===------- Offload API tests - olLaunchKernelSuggestedGroupSize ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "../common/Fixtures.hpp"
+#include <OffloadAPI.h>
+#include <gtest/gtest.h>
+
+/// Launch shapes under test: {Dimensions, NumItemsX, NumItemsY, NumItemsZ}.
+static constexpr uint32_t COMBOS[6][4] = {
+    {1, 64, 1, 1},  {1, 63, 1, 1},   {2, 64, 64, 1},
+    {2, 40, 40, 1}, {3, 64, 64, 64}, {3, 128, 20, 12},
+};
+
+/// Fixture that loads the "foo" device binary and looks up its "foo" kernel.
+struct olLaunchKernelSuggestedGroupSizeTest : OffloadQueueTest {
+  void SetUp() override {
+    RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp());
+    ASSERT_TRUE(TestEnvironment::loadDeviceBinary("foo", Device, DeviceBin));
+    // The size is unsigned, so ">= 0" could never fail; require a non-empty
+    // image instead.
+    ASSERT_GT(DeviceBin->getBufferSize(), 0lu);
+    ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(),
+                                   DeviceBin->getBufferSize(), &Program));
+    ASSERT_SUCCESS(olGetKernel(Program, "foo", &Kernel));
+  }
+
+  void TearDown() override {
+    if (Program) {
+      olDestroyProgram(Program);
+    }
+    RETURN_ON_FATAL_FAILURE(OffloadQueueTest::TearDown());
+  }
+
+  /// Launches the kernel once per COMBOS entry and checks that every element
+  /// of the output buffer equals its own index.
+  ///
+  /// \param LaunchQueue queue to launch on, or nullptr to launch without one.
+  ///        Only a non-null queue is waited on before the results are read.
+  void launchAndVerifyAll(ol_queue_handle_t LaunchQueue) {
+    for (const auto &C : COMBOS) {
+      std::string Scope;
+      llvm::raw_string_ostream OS{Scope};
+      OS << "{ " << C[0] << ", " << C[1] << ", " << C[2] << ", " << C[3] << "}";
+      OS.flush();
+      SCOPED_TRACE(Scope);
+
+      const uint32_t NumItems = C[1] * C[2] * C[3];
+
+      ol_kernel_launch_size_suggested_args_t LaunchArgs{};
+      LaunchArgs.Dimensions = C[0];
+      LaunchArgs.NumItemsX = C[1];
+      LaunchArgs.NumItemsY = C[2];
+      LaunchArgs.NumItemsZ = C[3];
+
+      void *Mem = nullptr;
+      ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,
+                                NumItems * sizeof(uint32_t), &Mem));
+      struct {
+        void *Mem;
+      } Args{Mem};
+
+      ASSERT_SUCCESS(olLaunchKernelSuggestedGroupSize(
+          LaunchQueue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs,
+          nullptr));
+
+      if (LaunchQueue) {
+        ASSERT_SUCCESS(olWaitQueue(LaunchQueue));
+      }
+
+      // The kernel writes uint32_t values, so read the buffer back as such.
+      const auto *Data = static_cast<const uint32_t *>(Mem);
+      for (uint32_t I = 0; I < NumItems; ++I) {
+        ASSERT_EQ(Data[I], I);
+      }
+
+      ASSERT_SUCCESS(olMemFree(Mem));
+    }
+  }
+
+  std::unique_ptr<llvm::MemoryBuffer> DeviceBin;
+  ol_program_handle_t Program = nullptr;
+  ol_kernel_handle_t Kernel = nullptr;
+};
+
+OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelSuggestedGroupSizeTest);
+
+/// Launch on the fixture's queue and wait for it before checking the output.
+TEST_P(olLaunchKernelSuggestedGroupSizeTest, Success) {
+  launchAndVerifyAll(Queue);
+}
+
+/// Launch without a queue; the fixture's unused queue is not waited on.
+TEST_P(olLaunchKernelSuggestedGroupSizeTest, SuccessSynchronous) {
+  launchAndVerifyAll(nullptr);
+}