From 883393cd04c69dd8dcde5f9354ca94f90d9cdc48 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Fri, 30 May 2025 12:57:54 +0100 Subject: [PATCH 1/2] [Offload] Add olLaunchKernelSuggestedGroupSize This adds a new entrypoint `olLaunchKernelSuggestedGroupSize` which launches a kernel without specifying a work group size. Implementations will use internal device specific magic to determine an ideal work group size. ... Eventually anyway, for this change it is just hardcoded as `{1, 1, 1}`. --- offload/liboffload/API/Kernel.td | 35 ++++++++ .../liboffload/include/generated/OffloadAPI.h | 71 +++++++++++++++++ .../include/generated/OffloadEntryPoints.inc | 79 +++++++++++++++++++ .../include/generated/OffloadFuncs.inc | 2 + .../generated/OffloadImplFuncDecls.inc | 6 ++ .../include/generated/OffloadPrint.hpp | 51 ++++++++++++ offload/liboffload/src/OffloadImpl.cpp | 63 +++++++++++---- 7 files changed, 293 insertions(+), 14 deletions(-) diff --git a/offload/liboffload/API/Kernel.td b/offload/liboffload/API/Kernel.td index 247f9c1bf5b6a..2ff75aa2ba002 100644 --- a/offload/liboffload/API/Kernel.td +++ b/offload/liboffload/API/Kernel.td @@ -59,3 +59,38 @@ def : Function { Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>, ]; } + + +def : Struct { + let name = "ol_kernel_launch_size_suggested_args_t"; + let desc = "Size-related arguments for a kernel launch."; + let members = [ + StructMember<"size_t", "Dimensions", "Number of work dimensions">, + StructMember<"size_t", "NumItemsX", "Number of work items on the X dimension">, + StructMember<"size_t", "NumItemsY", "Number of work items on the Y dimension">, + StructMember<"size_t", "NumItemsZ", "Number of work items on the Z dimension">, + StructMember<"size_t", "DynSharedMemory", "Size of dynamic shared memory in bytes."> + ]; +} + +def : Function { + let name = "olLaunchKernelSuggestedGroupSize"; + let desc = "Enqueue a kernel launch with the specified work items and parameters."; + let 
details = [ + "Behaves the same as olLaunchKernel, but the implementation automatically determines optimal work group sizes" + ]; + let params = [ + Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>, + Param<"ol_device_handle_t", "Device", "handle of the device to execute on", PARAM_IN>, + Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>, + Param<"const void*", "ArgumentsData", "pointer to the kernel argument struct", PARAM_IN_OPTIONAL>, + Param<"size_t", "ArgumentsSize", "size of the kernel argument struct", PARAM_IN>, + Param<"const ol_kernel_launch_size_suggested_args_t*", "LaunchSizeArgs", "pointer to the struct containing launch size parameters", PARAM_IN>, + Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL> + ]; + let returns = [ + Return<"OL_ERRC_INVALID_ARGUMENT", ["`Queue == NULL && EventOut != NULL`"]>, + Return<"OL_ERRC_INVALID_ARGUMENT", ["`ArgumentsSize > 0 && ArgumentsData == NULL`"]>, + Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>, + ]; +} diff --git a/offload/liboffload/include/generated/OffloadAPI.h b/offload/liboffload/include/generated/OffloadAPI.h index a1d7540519e32..1752340615a82 100644 --- a/offload/liboffload/include/generated/OffloadAPI.h +++ b/offload/liboffload/include/generated/OffloadAPI.h @@ -723,6 +723,54 @@ OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernel( // [out][optional] optional recorded event for the enqueued operation ol_event_handle_t *EventOut); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Size-related arguments for a kernel launch. 
+typedef struct ol_kernel_launch_size_suggested_args_t { + size_t Dimensions; /// Number of work dimensions + size_t NumItemsX; /// Number of work items on the X dimension + size_t NumItemsY; /// Number of work items on the Y dimension + size_t NumItemsZ; /// Number of work items on the Z dimension + size_t DynSharedMemory; /// Size of dynamic shared memory in bytes. +} ol_kernel_launch_size_suggested_args_t; + +/////////////////////////////////////////////////////////////////////////////// +/// @brief Enqueue a kernel launch with the specified work items and parameters. +/// +/// @details +/// - Behaves the same as olLaunchKernel, but the implementation +/// automatically determines optimal work group sizes +/// +/// @returns +/// - ::OL_RESULT_SUCCESS +/// - ::OL_ERRC_UNINITIALIZED +/// - ::OL_ERRC_DEVICE_LOST +/// - ::OL_ERRC_INVALID_ARGUMENT +/// + `Queue == NULL && EventOut != NULL` +/// - ::OL_ERRC_INVALID_ARGUMENT +/// + `ArgumentsSize > 0 && ArgumentsData == NULL` +/// - ::OL_ERRC_INVALID_DEVICE +/// + If Queue is non-null but does not belong to Device +/// - ::OL_ERRC_INVALID_NULL_HANDLE +/// + `NULL == Device` +/// + `NULL == Kernel` +/// - ::OL_ERRC_INVALID_NULL_POINTER +/// + `NULL == LaunchSizeArgs` +OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSize( + // [in][optional] handle of the queue + ol_queue_handle_t Queue, + // [in] handle of the device to execute on + ol_device_handle_t Device, + // [in] handle of the kernel + ol_kernel_handle_t Kernel, + // [in][optional] pointer to the kernel argument struct + const void *ArgumentsData, + // [in] size of the kernel argument struct + size_t ArgumentsSize, + // [in] pointer to the struct containing launch size parameters + const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs, + // [out][optional] optional recorded event for the enqueued operation + ol_event_handle_t *EventOut); + /////////////////////////////////////////////////////////////////////////////// /// @brief Function 
parameters for olGetPlatformInfo /// @details Each entry is a pointer to the parameter passed to the function; @@ -874,6 +922,19 @@ typedef struct ol_launch_kernel_params_t { ol_event_handle_t **pEventOut; } ol_launch_kernel_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for olLaunchKernelSuggestedGroupSize +/// @details Each entry is a pointer to the parameter passed to the function; +typedef struct ol_launch_kernel_suggested_group_size_params_t { + ol_queue_handle_t *pQueue; + ol_device_handle_t *pDevice; + ol_kernel_handle_t *pKernel; + const void **pArgumentsData; + size_t *pArgumentsSize; + const ol_kernel_launch_size_suggested_args_t **pLaunchSizeArgs; + ol_event_handle_t **pEventOut; +} ol_launch_kernel_suggested_group_size_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Variant of olInit that also sets source code location information /// @details See also ::olInit @@ -1016,6 +1077,16 @@ OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelWithCodeLoc( const ol_kernel_launch_size_args_t *LaunchSizeArgs, ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Variant of olLaunchKernelSuggestedGroupSize that also sets source +/// code location information +/// @details See also ::olLaunchKernelSuggestedGroupSize +OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSizeWithCodeLoc( + ol_queue_handle_t Queue, ol_device_handle_t Device, + ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize, + const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs, + ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation); + #if defined(__cplusplus) } // extern "C" #endif diff --git a/offload/liboffload/include/generated/OffloadEntryPoints.inc b/offload/liboffload/include/generated/OffloadEntryPoints.inc 
index 9feebeea09ec3..3b7c8be609c92 100644 --- a/offload/liboffload/include/generated/OffloadEntryPoints.inc +++ b/offload/liboffload/include/generated/OffloadEntryPoints.inc @@ -901,3 +901,82 @@ ol_result_t olLaunchKernelWithCodeLoc( currentCodeLocation() = nullptr; return Result; } + +/////////////////////////////////////////////////////////////////////////////// +llvm::Error olLaunchKernelSuggestedGroupSize_val( + ol_queue_handle_t Queue, ol_device_handle_t Device, + ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize, + const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs, + ol_event_handle_t *EventOut) { + if (offloadConfig().ValidationEnabled) { + if (Queue == NULL && EventOut != NULL) { + return createOffloadError( + error::ErrorCode::INVALID_ARGUMENT, + "validation failure: Queue == NULL && EventOut != NULL"); + } + + if (ArgumentsSize > 0 && ArgumentsData == NULL) { + return createOffloadError( + error::ErrorCode::INVALID_ARGUMENT, + "validation failure: ArgumentsSize > 0 && ArgumentsData == NULL"); + } + + if (NULL == Device) { + return createOffloadError(error::ErrorCode::INVALID_NULL_HANDLE, + "validation failure: NULL == Device"); + } + + if (NULL == Kernel) { + return createOffloadError(error::ErrorCode::INVALID_NULL_HANDLE, + "validation failure: NULL == Kernel"); + } + + if (NULL == LaunchSizeArgs) { + return createOffloadError(error::ErrorCode::INVALID_NULL_POINTER, + "validation failure: NULL == LaunchSizeArgs"); + } + } + + return llvm::offload::olLaunchKernelSuggestedGroupSize_impl( + Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs, + EventOut); +} +OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSize( + ol_queue_handle_t Queue, ol_device_handle_t Device, + ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize, + const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs, + ol_event_handle_t *EventOut) { + if (offloadConfig().TracingEnabled) { + llvm::errs() 
<< "---> olLaunchKernelSuggestedGroupSize"; + } + + ol_result_t Result = + llvmErrorToOffloadError(olLaunchKernelSuggestedGroupSize_val( + Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs, + EventOut)); + + if (offloadConfig().TracingEnabled) { + ol_launch_kernel_suggested_group_size_params_t Params = { + &Queue, &Device, &Kernel, &ArgumentsData, + &ArgumentsSize, &LaunchSizeArgs, &EventOut}; + llvm::errs() << "(" << &Params << ")"; + llvm::errs() << "-> " << Result << "\n"; + if (Result && Result->Details) { + llvm::errs() << " *Error Details* " << Result->Details << " \n"; + } + } + return Result; +} +ol_result_t olLaunchKernelSuggestedGroupSizeWithCodeLoc( + ol_queue_handle_t Queue, ol_device_handle_t Device, + ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize, + const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs, + ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation) { + currentCodeLocation() = CodeLocation; + ol_result_t Result = ::olLaunchKernelSuggestedGroupSize( + Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs, + EventOut); + + currentCodeLocation() = nullptr; + return Result; +} diff --git a/offload/liboffload/include/generated/OffloadFuncs.inc b/offload/liboffload/include/generated/OffloadFuncs.inc index 78ff9ddb82799..48a1c73dad631 100644 --- a/offload/liboffload/include/generated/OffloadFuncs.inc +++ b/offload/liboffload/include/generated/OffloadFuncs.inc @@ -29,6 +29,7 @@ OFFLOAD_FUNC(olCreateProgram) OFFLOAD_FUNC(olDestroyProgram) OFFLOAD_FUNC(olGetKernel) OFFLOAD_FUNC(olLaunchKernel) +OFFLOAD_FUNC(olLaunchKernelSuggestedGroupSize) OFFLOAD_FUNC(olInitWithCodeLoc) OFFLOAD_FUNC(olShutDownWithCodeLoc) OFFLOAD_FUNC(olGetPlatformInfoWithCodeLoc) @@ -48,5 +49,6 @@ OFFLOAD_FUNC(olCreateProgramWithCodeLoc) OFFLOAD_FUNC(olDestroyProgramWithCodeLoc) OFFLOAD_FUNC(olGetKernelWithCodeLoc) OFFLOAD_FUNC(olLaunchKernelWithCodeLoc) 
+OFFLOAD_FUNC(olLaunchKernelSuggestedGroupSizeWithCodeLoc) #undef OFFLOAD_FUNC diff --git a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc index 71d25dee87867..d8c94e59182bc 100644 --- a/offload/liboffload/include/generated/OffloadImplFuncDecls.inc +++ b/offload/liboffload/include/generated/OffloadImplFuncDecls.inc @@ -58,3 +58,9 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device, size_t ArgumentsSize, const ol_kernel_launch_size_args_t *LaunchSizeArgs, ol_event_handle_t *EventOut); + +Error olLaunchKernelSuggestedGroupSize_impl( + ol_queue_handle_t Queue, ol_device_handle_t Device, + ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize, + const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs, + ol_event_handle_t *EventOut); diff --git a/offload/liboffload/include/generated/OffloadPrint.hpp b/offload/liboffload/include/generated/OffloadPrint.hpp index 3aad6223d4dea..706f45987e662 100644 --- a/offload/liboffload/include/generated/OffloadPrint.hpp +++ b/offload/liboffload/include/generated/OffloadPrint.hpp @@ -392,6 +392,31 @@ operator<<(llvm::raw_ostream &os, os << "}"; return os; } +/////////////////////////////////////////////////////////////////////////////// +/// @brief Print operator for the ol_kernel_launch_size_suggested_args_t type +/// @returns llvm::raw_ostream & + +inline llvm::raw_ostream & +operator<<(llvm::raw_ostream &os, + const struct ol_kernel_launch_size_suggested_args_t params) { + os << "(struct ol_kernel_launch_size_suggested_args_t){"; + os << ".Dimensions = "; + os << params.Dimensions; + os << ", "; + os << ".NumItemsX = "; + os << params.NumItemsX; + os << ", "; + os << ".NumItemsY = "; + os << params.NumItemsY; + os << ", "; + os << ".NumItemsZ = "; + os << params.NumItemsZ; + os << ", "; + os << ".DynSharedMemory = "; + os << params.DynSharedMemory; + os << "}"; + return os; +} inline llvm::raw_ostream & 
operator<<(llvm::raw_ostream &os, @@ -619,6 +644,32 @@ operator<<(llvm::raw_ostream &os, return os; } +inline llvm::raw_ostream &operator<<( + llvm::raw_ostream &os, + const struct ol_launch_kernel_suggested_group_size_params_t *params) { + os << ".Queue = "; + printPtr(os, *params->pQueue); + os << ", "; + os << ".Device = "; + printPtr(os, *params->pDevice); + os << ", "; + os << ".Kernel = "; + printPtr(os, *params->pKernel); + os << ", "; + os << ".ArgumentsData = "; + printPtr(os, *params->pArgumentsData); + os << ", "; + os << ".ArgumentsSize = "; + os << *params->pArgumentsSize; + os << ", "; + os << ".LaunchSizeArgs = "; + printPtr(os, *params->pLaunchSizeArgs); + os << ", "; + os << ".EventOut = "; + printPtr(os, *params->pEventOut); + return os; +} + /////////////////////////////////////////////////////////////////////////////// // @brief Print pointer value template diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp index 7b67cbba43e68..8a57afa8522c5 100644 --- a/offload/liboffload/src/OffloadImpl.cpp +++ b/offload/liboffload/src/OffloadImpl.cpp @@ -484,11 +484,10 @@ Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName, return Error::success(); } -Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device, - ol_kernel_handle_t Kernel, const void *ArgumentsData, - size_t ArgumentsSize, - const ol_kernel_launch_size_args_t *LaunchSizeArgs, - ol_event_handle_t *EventOut) { +namespace { +Error do_launch(ol_queue_handle_t Queue, ol_device_handle_t Device, + ol_kernel_handle_t Kernel, KernelArgsTy &Args, + ol_event_handle_t *EventOut) { auto *DeviceImpl = Device->Device; if (Queue && Device != Queue->Device) { return createOffloadError( @@ -498,6 +497,26 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device, auto *QueueImpl = Queue ? 
Queue->AsyncInfo : nullptr; AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl); + auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel); + auto Err = KernelImpl->launch(*DeviceImpl, Args.ArgPtrs, nullptr, Args, + AsyncInfoWrapper); + + AsyncInfoWrapper.finalize(Err); + if (Err) + return Err; + + if (EventOut) + *EventOut = makeEvent(Queue); + + return Error::success(); +} +} // namespace + +Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device, + ol_kernel_handle_t Kernel, const void *ArgumentsData, + size_t ArgumentsSize, + const ol_kernel_launch_size_args_t *LaunchSizeArgs, + ol_event_handle_t *EventOut) { KernelArgsTy LaunchArgs{}; LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroupsX; LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroupsY; @@ -514,18 +533,34 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device, // Don't do anything with pointer indirection; use arg data as-is LaunchArgs.Flags.IsCUDA = true; - auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel); - auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr, - LaunchArgs, AsyncInfoWrapper); + return do_launch(Queue, Device, Kernel, LaunchArgs, EventOut); +} - AsyncInfoWrapper.finalize(Err); - if (Err) - return Err; +Error olLaunchKernelSuggestedGroupSize_impl( + ol_queue_handle_t Queue, ol_device_handle_t Device, + ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize, + const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs, + ol_event_handle_t *EventOut) { + // TODO: Use backend specific magic to determine the best work group size + size_t PreferredSize[3] = {1, 1, 1}; - if (EventOut) - *EventOut = makeEvent(Queue); + KernelArgsTy LaunchArgs{}; + LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumItemsX / PreferredSize[0]; + LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumItemsY / PreferredSize[1]; + LaunchArgs.NumTeams[2] = LaunchSizeArgs->NumItemsZ / PreferredSize[2]; + LaunchArgs.ThreadLimit[0] = PreferredSize[0]; +
LaunchArgs.ThreadLimit[1] = PreferredSize[1]; + LaunchArgs.ThreadLimit[2] = PreferredSize[2]; + LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory; - return Error::success(); + KernelLaunchParamsTy Params; + Params.Data = const_cast<void *>(ArgumentsData); + Params.Size = ArgumentsSize; + LaunchArgs.ArgPtrs = reinterpret_cast<void **>(&Params); + // Don't do anything with pointer indirection; use arg data as-is + LaunchArgs.Flags.IsCUDA = true; + + return do_launch(Queue, Device, Kernel, LaunchArgs, EventOut); } } // namespace offload From ba29f6964352ddb8eae8a0a9f3fbda1be8afe8c5 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Mon, 2 Jun 2025 15:46:28 +0100 Subject: [PATCH 2/2] Add test --- offload/unittests/OffloadAPI/CMakeLists.txt | 1 + .../unittests/OffloadAPI/device_code/foo.c | 8 +- .../olLaunchKernelSuggestedGroupSize.cpp | 114 ++++++++++++++++++ 3 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 offload/unittests/OffloadAPI/kernel/olLaunchKernelSuggestedGroupSize.cpp diff --git a/offload/unittests/OffloadAPI/CMakeLists.txt b/offload/unittests/OffloadAPI/CMakeLists.txt index 54b5c4e245e62..9a8a53aaa98a8 100644 --- a/offload/unittests/OffloadAPI/CMakeLists.txt +++ b/offload/unittests/OffloadAPI/CMakeLists.txt @@ -20,6 +20,7 @@ add_offload_unittest("offload.unittests" ${CMAKE_CURRENT_SOURCE_DIR}/program/olDestroyProgram.cpp ${CMAKE_CURRENT_SOURCE_DIR}/kernel/olGetKernel.cpp ${CMAKE_CURRENT_SOURCE_DIR}/kernel/olLaunchKernel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/kernel/olLaunchKernelSuggestedGroupSize.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event/olDestroyEvent.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event/olWaitEvent.cpp ) diff --git a/offload/unittests/OffloadAPI/device_code/foo.c b/offload/unittests/OffloadAPI/device_code/foo.c index 83cdc53cddd8d..ffe1a71a0719b 100644 --- a/offload/unittests/OffloadAPI/device_code/foo.c +++ b/offload/unittests/OffloadAPI/device_code/foo.c @@ -2,5 +2,11 @@ #include <stdint.h> __gpu_kernel void foo(uint32_t *out) { - out[__gpu_thread_id(0)] =
__gpu_thread_id(0); + int x = __gpu_block_id(0) * __gpu_num_threads(0) + __gpu_thread_id(0); + int xw = __gpu_num_blocks(0) * __gpu_num_threads(0); + int y = __gpu_block_id(1) * __gpu_num_threads(1) + __gpu_thread_id(1); + int yw = __gpu_num_blocks(1) * __gpu_num_threads(1); + int z = __gpu_block_id(2) * __gpu_num_threads(2) + __gpu_thread_id(2); + int offset = (z * yw * xw) + (y * xw) + x; + out[offset] = offset; } diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernelSuggestedGroupSize.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernelSuggestedGroupSize.cpp new file mode 100644 index 0000000000000..5c13351b1066d --- /dev/null +++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernelSuggestedGroupSize.cpp @@ -0,0 +1,114 @@ +//===------- Offload API tests - olLaunchKernelSuggestedGroupSize ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "../common/Fixtures.hpp" +#include <OffloadAPI.h> +#include <gtest/gtest.h> + +static constexpr uint32_t COMBOS[6][4] = { + {1, 64, 1, 1}, {1, 63, 1, 1}, {2, 64, 64, 1}, + {2, 40, 40, 1}, {3, 64, 64, 64}, {3, 128, 20, 12}, +}; + +struct olLaunchKernelSuggestedGroupSizeTest : OffloadQueueTest { + void SetUp() override { + RETURN_ON_FATAL_FAILURE(OffloadQueueTest::SetUp()); + ASSERT_TRUE(TestEnvironment::loadDeviceBinary("foo", Device, DeviceBin)); + ASSERT_GT(DeviceBin->getBufferSize(), 0lu); + ASSERT_SUCCESS(olCreateProgram(Device, DeviceBin->getBufferStart(), + DeviceBin->getBufferSize(), &Program)); + ASSERT_SUCCESS(olGetKernel(Program, "foo", &Kernel)); + } + + void TearDown() override { + if (Program) { + olDestroyProgram(Program); + } + RETURN_ON_FATAL_FAILURE(OffloadQueueTest::TearDown()); + } + + std::unique_ptr<llvm::MemoryBuffer> DeviceBin; + ol_program_handle_t Program = nullptr; +
ol_kernel_handle_t Kernel = nullptr; +}; + +OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olLaunchKernelSuggestedGroupSizeTest); + +TEST_P(olLaunchKernelSuggestedGroupSizeTest, Success) { + for (auto C : COMBOS) { + std::string scope{}; + llvm::raw_string_ostream os{scope}; + os << "{ " << C[0] << ", " << C[1] << ", " << C[2] << ", " << C[3] << "}"; + os.flush(); + SCOPED_TRACE(scope); + + auto NumItems = C[1] * C[2] * C[3]; + + ol_kernel_launch_size_suggested_args_t LaunchArgs{}; + LaunchArgs.Dimensions = C[0]; + LaunchArgs.NumItemsX = C[1]; + LaunchArgs.NumItemsY = C[2]; + LaunchArgs.NumItemsZ = C[3]; + + void *Mem; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, + NumItems * sizeof(int), &Mem)); + struct { + void *Mem; + } Args{Mem}; + + ASSERT_SUCCESS(olLaunchKernelSuggestedGroupSize( + Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs, nullptr)); + + ASSERT_SUCCESS(olWaitQueue(Queue)); + + int *Data = (int *)Mem; + for (int i = 0; i < static_cast<int>(NumItems); i++) { + ASSERT_EQ(Data[i], i); + } + + ASSERT_SUCCESS(olMemFree(Mem)); + } +} + +TEST_P(olLaunchKernelSuggestedGroupSizeTest, SuccessSynchronous) { + for (auto C : COMBOS) { + std::string scope{}; + llvm::raw_string_ostream os{scope}; + os << "{ " << C[0] << ", " << C[1] << ", " << C[2] << ", " << C[3] << "}"; + os.flush(); + SCOPED_TRACE(scope); + + auto NumItems = C[1] * C[2] * C[3]; + + ol_kernel_launch_size_suggested_args_t LaunchArgs{}; + LaunchArgs.Dimensions = C[0]; + LaunchArgs.NumItemsX = C[1]; + LaunchArgs.NumItemsY = C[2]; + LaunchArgs.NumItemsZ = C[3]; + + void *Mem; + ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, + NumItems * sizeof(int), &Mem)); + struct { + void *Mem; + } Args{Mem}; + + ASSERT_SUCCESS(olLaunchKernelSuggestedGroupSize( + nullptr, Device, Kernel, &Args, sizeof(Args), &LaunchArgs, nullptr)); + + ASSERT_SUCCESS(olWaitQueue(Queue)); + + int *Data = (int *)Mem; + for (int i = 0; i < static_cast<int>(NumItems); i++) { + ASSERT_EQ(Data[i], i); + } + +
ASSERT_SUCCESS(olMemFree(Mem)); + } +}