Skip to content

Commit 883393c

Browse files
committed
[Offload] Add olLaunchKernelSuggestedGroupSize
This adds a new entry point, `olLaunchKernelSuggestedGroupSize`, which launches a kernel without the caller specifying a work group size. Implementations will eventually use internal, device-specific heuristics to determine an ideal work group size; for this change the size is simply hardcoded as `{1, 1, 1}`.
1 parent 41e22aa commit 883393c

File tree

7 files changed

+293
-14
lines changed

7 files changed

+293
-14
lines changed

offload/liboffload/API/Kernel.td

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,3 +59,38 @@ def : Function {
5959
Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
6060
];
6161
}
62+
63+
64+
// Launch-size description used by olLaunchKernelSuggestedGroupSize. It carries
// the number of work items per dimension plus the dynamic shared memory size;
// it has no work group size members.
def : Struct {
65+
let name = "ol_kernel_launch_size_suggested_args_t";
66+
let desc = "Size-related arguments for a kernel launch.";
67+
let members = [
68+
StructMember<"size_t", "Dimensions", "Number of work dimensions">,
69+
StructMember<"size_t", "NumItemsX", "Number of work items on the X dimension">,
70+
StructMember<"size_t", "NumItemsY", "Number of work items on the Y dimension">,
71+
StructMember<"size_t", "NumItemsZ", "Number of work items on the Z dimension">,
72+
StructMember<"size_t", "DynSharedMemory", "Size of dynamic shared memory in bytes.">
73+
];
74+
}
75+
76+
// Entry point that enqueues a kernel launch described by work item counts
// (ol_kernel_launch_size_suggested_args_t); the work group size is chosen by
// the implementation. Queue and ArgumentsData are optional inputs and EventOut
// is an optional output. The `returns` list holds only the conditions specific
// to this function.
// NOTE(review): null handle/pointer errors for the non-optional parameters
// appear to be derived from the PARAM_* annotations by the generator - confirm.
def : Function {
77+
let name = "olLaunchKernelSuggestedGroupSize";
78+
let desc = "Enqueue a kernel launch with the specified work items and parameters.";
79+
let details = [
80+
"Behaves the same as olLaunchKernel, but the implementation automatically determines optimal work group sizes"
81+
];
82+
let params = [
83+
Param<"ol_queue_handle_t", "Queue", "handle of the queue", PARAM_IN_OPTIONAL>,
84+
Param<"ol_device_handle_t", "Device", "handle of the device to execute on", PARAM_IN>,
85+
Param<"ol_kernel_handle_t", "Kernel", "handle of the kernel", PARAM_IN>,
86+
Param<"const void*", "ArgumentsData", "pointer to the kernel argument struct", PARAM_IN_OPTIONAL>,
87+
Param<"size_t", "ArgumentsSize", "size of the kernel argument struct", PARAM_IN>,
88+
Param<"const ol_kernel_launch_size_suggested_args_t*", "LaunchSizeArgs", "pointer to the struct containing launch size parameters", PARAM_IN>,
89+
Param<"ol_event_handle_t*", "EventOut", "optional recorded event for the enqueued operation", PARAM_OUT_OPTIONAL>
90+
];
91+
let returns = [
92+
Return<"OL_ERRC_INVALID_ARGUMENT", ["`Queue == NULL && EventOut != NULL`"]>,
93+
Return<"OL_ERRC_INVALID_ARGUMENT", ["`ArgumentsSize > 0 && ArgumentsData == NULL`"]>,
94+
Return<"OL_ERRC_INVALID_DEVICE", ["If Queue is non-null but does not belong to Device"]>,
95+
];
96+
}

offload/liboffload/include/generated/OffloadAPI.h

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -723,6 +723,54 @@ OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernel(
723723
// [out][optional] optional recorded event for the enqueued operation
724724
ol_event_handle_t *EventOut);
725725

726+
///////////////////////////////////////////////////////////////////////////////
727+
/// @brief Size-related arguments for a kernel launch.
/// @details Passed to ::olLaunchKernelSuggestedGroupSize. Sizes are expressed
/// as work item counts per dimension; no work group size is given.
728+
typedef struct ol_kernel_launch_size_suggested_args_t {
729+
size_t Dimensions; ///< Number of work dimensions
730+
size_t NumItemsX; ///< Number of work items on the X dimension
731+
size_t NumItemsY; ///< Number of work items on the Y dimension
732+
size_t NumItemsZ; ///< Number of work items on the Z dimension
733+
size_t DynSharedMemory; ///< Size of dynamic shared memory in bytes.
734+
} ol_kernel_launch_size_suggested_args_t;
735+
736+
///////////////////////////////////////////////////////////////////////////////
737+
/// @brief Enqueue a kernel launch with the specified work items and parameters.
738+
///
739+
/// @details
740+
/// - Behaves the same as olLaunchKernel, but the implementation
741+
/// automatically determines optimal work group sizes
/// - LaunchSizeArgs carries per-dimension work item counts rather than
/// work group counts and work group sizes
742+
///
743+
/// @returns
744+
/// - ::OL_RESULT_SUCCESS
745+
/// - ::OL_ERRC_UNINITIALIZED
746+
/// - ::OL_ERRC_DEVICE_LOST
747+
/// - ::OL_ERRC_INVALID_ARGUMENT
748+
/// + `Queue == NULL && EventOut != NULL`
749+
/// - ::OL_ERRC_INVALID_ARGUMENT
750+
/// + `ArgumentsSize > 0 && ArgumentsData == NULL`
751+
/// - ::OL_ERRC_INVALID_DEVICE
752+
/// + If Queue is non-null but does not belong to Device
753+
/// - ::OL_ERRC_INVALID_NULL_HANDLE
754+
/// + `NULL == Device`
755+
/// + `NULL == Kernel`
756+
/// - ::OL_ERRC_INVALID_NULL_POINTER
757+
/// + `NULL == LaunchSizeArgs`
758+
OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSize(
759+
// [in][optional] handle of the queue
760+
ol_queue_handle_t Queue,
761+
// [in] handle of the device to execute on
762+
ol_device_handle_t Device,
763+
// [in] handle of the kernel
764+
ol_kernel_handle_t Kernel,
765+
// [in][optional] pointer to the kernel argument struct
766+
const void *ArgumentsData,
767+
// [in] size of the kernel argument struct
768+
size_t ArgumentsSize,
769+
// [in] pointer to the struct containing launch size parameters
770+
const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
771+
// [out][optional] optional recorded event for the enqueued operation
772+
ol_event_handle_t *EventOut);
773+
726774
///////////////////////////////////////////////////////////////////////////////
727775
/// @brief Function parameters for olGetPlatformInfo
728776
/// @details Each entry is a pointer to the parameter passed to the function;
@@ -874,6 +922,19 @@ typedef struct ol_launch_kernel_params_t {
874922
ol_event_handle_t **pEventOut;
875923
} ol_launch_kernel_params_t;
876924

925+
///////////////////////////////////////////////////////////////////////////////
926+
/// @brief Function parameters for olLaunchKernelSuggestedGroupSize
927+
/// @details Each entry is a pointer to the parameter passed to the function.
928+
typedef struct ol_launch_kernel_suggested_group_size_params_t {
929+
ol_queue_handle_t *pQueue;
930+
ol_device_handle_t *pDevice;
931+
ol_kernel_handle_t *pKernel;
932+
const void **pArgumentsData;
933+
size_t *pArgumentsSize;
934+
const ol_kernel_launch_size_suggested_args_t **pLaunchSizeArgs;
935+
ol_event_handle_t **pEventOut;
936+
} ol_launch_kernel_suggested_group_size_params_t;
937+
877938
///////////////////////////////////////////////////////////////////////////////
878939
/// @brief Variant of olInit that also sets source code location information
879940
/// @details See also ::olInit
@@ -1016,6 +1077,16 @@ OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelWithCodeLoc(
10161077
const ol_kernel_launch_size_args_t *LaunchSizeArgs,
10171078
ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
10181079

1080+
///////////////////////////////////////////////////////////////////////////////
1081+
/// @brief Variant of olLaunchKernelSuggestedGroupSize that also sets source
1082+
/// code location information
1083+
/// @details See also ::olLaunchKernelSuggestedGroupSize
/// All parameters other than CodeLocation have the same meaning as in
/// ::olLaunchKernelSuggestedGroupSize.
1084+
OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSizeWithCodeLoc(
1085+
ol_queue_handle_t Queue, ol_device_handle_t Device,
1086+
ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
1087+
const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
1088+
ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation);
1089+
10191090
#if defined(__cplusplus)
10201091
} // extern "C"
10211092
#endif

offload/liboffload/include/generated/OffloadEntryPoints.inc

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -901,3 +901,82 @@ ol_result_t olLaunchKernelWithCodeLoc(
901901
currentCodeLocation() = nullptr;
902902
return Result;
903903
}
904+
905+
///////////////////////////////////////////////////////////////////////////////
906+
/// Argument-validation wrapper around olLaunchKernelSuggestedGroupSize_impl.
///
/// If offloadConfig().ValidationEnabled is set, the arguments are checked in
/// the order written below and the first failing check is returned as an
/// offload error. When validation is disabled, or every check passes, all
/// arguments are forwarded unchanged to the implementation and its result is
/// returned.
llvm::Error olLaunchKernelSuggestedGroupSize_val(
    ol_queue_handle_t Queue, ol_device_handle_t Device,
    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
    ol_event_handle_t *EventOut) {
  const bool Validate = offloadConfig().ValidationEnabled;

  // An output event was requested but no queue was supplied.
  if (Validate && Queue == nullptr && EventOut != nullptr)
    return createOffloadError(
        error::ErrorCode::INVALID_ARGUMENT,
        "validation failure: Queue == NULL && EventOut != NULL");

  // A non-zero argument size needs an argument buffer.
  if (Validate && ArgumentsSize > 0 && ArgumentsData == nullptr)
    return createOffloadError(
        error::ErrorCode::INVALID_ARGUMENT,
        "validation failure: ArgumentsSize > 0 && ArgumentsData == NULL");

  if (Validate && Device == nullptr)
    return createOffloadError(error::ErrorCode::INVALID_NULL_HANDLE,
                              "validation failure: NULL == Device");

  if (Validate && Kernel == nullptr)
    return createOffloadError(error::ErrorCode::INVALID_NULL_HANDLE,
                              "validation failure: NULL == Kernel");

  if (Validate && LaunchSizeArgs == nullptr)
    return createOffloadError(error::ErrorCode::INVALID_NULL_POINTER,
                              "validation failure: NULL == LaunchSizeArgs");

  return llvm::offload::olLaunchKernelSuggestedGroupSize_impl(
      Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
      EventOut);
}
944+
/// Exported entry point for olLaunchKernelSuggestedGroupSize.
///
/// Runs the arguments through olLaunchKernelSuggestedGroupSize_val and converts
/// the resulting llvm::Error into an ol_result_t. When
/// offloadConfig().TracingEnabled is set, the function name is written to
/// llvm::errs() before the call, and the arguments, the result and any error
/// details are written after it.
OL_APIEXPORT ol_result_t OL_APICALL olLaunchKernelSuggestedGroupSize(
    ol_queue_handle_t Queue, ol_device_handle_t Device,
    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
    ol_event_handle_t *EventOut) {
  if (offloadConfig().TracingEnabled)
    llvm::errs() << "---> olLaunchKernelSuggestedGroupSize";

  ol_result_t Result =
      llvmErrorToOffloadError(olLaunchKernelSuggestedGroupSize_val(
          Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
          EventOut));

  // Nothing more to report unless tracing is switched on.
  if (!offloadConfig().TracingEnabled)
    return Result;

  // The params struct stores the address of each argument so that the
  // pointer-to-params print operator can format them.
  ol_launch_kernel_suggested_group_size_params_t Params = {
      &Queue,         &Device,         &Kernel,  &ArgumentsData,
      &ArgumentsSize, &LaunchSizeArgs, &EventOut};
  llvm::errs() << "(" << &Params << ")"
               << "-> " << Result << "\n";

  const bool HasDetails = Result && Result->Details;
  if (HasDetails)
    llvm::errs() << " *Error Details* " << Result->Details << " \n";

  return Result;
}
970+
/// Variant of olLaunchKernelSuggestedGroupSize that records a code location.
///
/// Assigns CodeLocation through currentCodeLocation() before forwarding the
/// remaining arguments to ::olLaunchKernelSuggestedGroupSize, then resets it to
/// nullptr and returns the forwarded call's result.
ol_result_t olLaunchKernelSuggestedGroupSizeWithCodeLoc(
    ol_queue_handle_t Queue, ol_device_handle_t Device,
    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
    ol_event_handle_t *EventOut, ol_code_location_t *CodeLocation) {
  // Make the caller's location available while the launch is in progress.
  currentCodeLocation() = CodeLocation;

  const ol_result_t LaunchResult = ::olLaunchKernelSuggestedGroupSize(
      Queue, Device, Kernel, ArgumentsData, ArgumentsSize, LaunchSizeArgs,
      EventOut);

  // Clear it again so later calls do not see a stale location.
  currentCodeLocation() = nullptr;
  return LaunchResult;
}

offload/liboffload/include/generated/OffloadFuncs.inc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ OFFLOAD_FUNC(olCreateProgram)
2929
OFFLOAD_FUNC(olDestroyProgram)
3030
OFFLOAD_FUNC(olGetKernel)
3131
OFFLOAD_FUNC(olLaunchKernel)
32+
OFFLOAD_FUNC(olLaunchKernelSuggestedGroupSize)
3233
OFFLOAD_FUNC(olInitWithCodeLoc)
3334
OFFLOAD_FUNC(olShutDownWithCodeLoc)
3435
OFFLOAD_FUNC(olGetPlatformInfoWithCodeLoc)
@@ -48,5 +49,6 @@ OFFLOAD_FUNC(olCreateProgramWithCodeLoc)
4849
OFFLOAD_FUNC(olDestroyProgramWithCodeLoc)
4950
OFFLOAD_FUNC(olGetKernelWithCodeLoc)
5051
OFFLOAD_FUNC(olLaunchKernelWithCodeLoc)
52+
OFFLOAD_FUNC(olLaunchKernelSuggestedGroupSizeWithCodeLoc)
5153

5254
#undef OFFLOAD_FUNC

offload/liboffload/include/generated/OffloadImplFuncDecls.inc

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,9 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
5858
size_t ArgumentsSize,
5959
const ol_kernel_launch_size_args_t *LaunchSizeArgs,
6060
ol_event_handle_t *EventOut);
61+
62+
// Implementation hook for olLaunchKernelSuggestedGroupSize. It takes the same
// parameter list as the public entry point and reports failure through the
// returned Error.
Error olLaunchKernelSuggestedGroupSize_impl(
63+
ol_queue_handle_t Queue, ol_device_handle_t Device,
64+
ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
65+
const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
66+
ol_event_handle_t *EventOut);

offload/liboffload/include/generated/OffloadPrint.hpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -392,6 +392,31 @@ operator<<(llvm::raw_ostream &os,
392392
os << "}";
393393
return os;
394394
}
395+
///////////////////////////////////////////////////////////////////////////////
396+
/// @brief Print operator for the ol_kernel_launch_size_suggested_args_t type
397+
/// @returns llvm::raw_ostream &
398+
399+
inline llvm::raw_ostream &
400+
operator<<(llvm::raw_ostream &os,
401+
const struct ol_kernel_launch_size_suggested_args_t params) {
402+
os << "(struct ol_kernel_launch_size_suggested_args_t){";
403+
os << ".Dimensions = ";
404+
os << params.Dimensions;
405+
os << ", ";
406+
os << ".NumItemsX = ";
407+
os << params.NumItemsX;
408+
os << ", ";
409+
os << ".NumItemsY = ";
410+
os << params.NumItemsY;
411+
os << ", ";
412+
os << ".NumItemsZ = ";
413+
os << params.NumItemsZ;
414+
os << ", ";
415+
os << ".DynSharedMemory = ";
416+
os << params.DynSharedMemory;
417+
os << "}";
418+
return os;
419+
}
395420

396421
inline llvm::raw_ostream &
397422
operator<<(llvm::raw_ostream &os,
@@ -619,6 +644,32 @@ operator<<(llvm::raw_ostream &os,
619644
return os;
620645
}
621646

647+
/// Print operator for the olLaunchKernelSuggestedGroupSize parameter pack.
///
/// Dereferences each entry of `params` and writes it as `.Name = value`,
/// separated by ", ". Pointer-like arguments go through printPtr; the argument
/// size is streamed directly. No enclosing delimiters are written.
inline llvm::raw_ostream &operator<<(
    llvm::raw_ostream &os,
    const struct ol_launch_kernel_suggested_group_size_params_t *params) {
  os << ".Queue = ";
  printPtr(os, *params->pQueue);

  os << ", " << ".Device = ";
  printPtr(os, *params->pDevice);

  os << ", " << ".Kernel = ";
  printPtr(os, *params->pKernel);

  os << ", " << ".ArgumentsData = ";
  printPtr(os, *params->pArgumentsData);

  os << ", " << ".ArgumentsSize = " << *params->pArgumentsSize;

  os << ", " << ".LaunchSizeArgs = ";
  printPtr(os, *params->pLaunchSizeArgs);

  os << ", " << ".EventOut = ";
  printPtr(os, *params->pEventOut);

  return os;
}
672+
622673
///////////////////////////////////////////////////////////////////////////////
623674
// @brief Print pointer value
624675
template <typename T>

offload/liboffload/src/OffloadImpl.cpp

Lines changed: 49 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -484,11 +484,10 @@ Error olGetKernel_impl(ol_program_handle_t Program, const char *KernelName,
484484
return Error::success();
485485
}
486486

487-
Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
488-
ol_kernel_handle_t Kernel, const void *ArgumentsData,
489-
size_t ArgumentsSize,
490-
const ol_kernel_launch_size_args_t *LaunchSizeArgs,
491-
ol_event_handle_t *EventOut) {
487+
namespace {
488+
Error do_launch(ol_queue_handle_t Queue, ol_device_handle_t Device,
489+
ol_kernel_handle_t Kernel, KernelArgsTy &Args,
490+
ol_event_handle_t *EventOut) {
492491
auto *DeviceImpl = Device->Device;
493492
if (Queue && Device != Queue->Device) {
494493
return createOffloadError(
@@ -498,6 +497,26 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
498497

499498
auto *QueueImpl = Queue ? Queue->AsyncInfo : nullptr;
500499
AsyncInfoWrapperTy AsyncInfoWrapper(*DeviceImpl, QueueImpl);
500+
auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
501+
auto Err = KernelImpl->launch(*DeviceImpl, Args.ArgPtrs, nullptr, Args,
502+
AsyncInfoWrapper);
503+
504+
AsyncInfoWrapper.finalize(Err);
505+
if (Err)
506+
return Err;
507+
508+
if (EventOut)
509+
*EventOut = makeEvent(Queue);
510+
511+
return Error::success();
512+
}
513+
} // namespace
514+
515+
Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
516+
ol_kernel_handle_t Kernel, const void *ArgumentsData,
517+
size_t ArgumentsSize,
518+
const ol_kernel_launch_size_args_t *LaunchSizeArgs,
519+
ol_event_handle_t *EventOut) {
501520
KernelArgsTy LaunchArgs{};
502521
LaunchArgs.NumTeams[0] = LaunchSizeArgs->NumGroupsX;
503522
LaunchArgs.NumTeams[1] = LaunchSizeArgs->NumGroupsY;
@@ -514,18 +533,34 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
514533
// Don't do anything with pointer indirection; use arg data as-is
515534
LaunchArgs.Flags.IsCUDA = true;
516535

517-
auto *KernelImpl = reinterpret_cast<GenericKernelTy *>(Kernel);
518-
auto Err = KernelImpl->launch(*DeviceImpl, LaunchArgs.ArgPtrs, nullptr,
519-
LaunchArgs, AsyncInfoWrapper);
536+
return do_launch(Queue, Device, Kernel, LaunchArgs, EventOut);
537+
}
520538

521-
AsyncInfoWrapper.finalize(Err);
522-
if (Err)
523-
return Err;
539+
/// Launches Kernel using a work group size chosen here, not by the caller.
///
/// The per-dimension work item counts in LaunchSizeArgs are divided by the
/// chosen group size to obtain the number of groups, the kernel argument
/// buffer is passed through as-is, and the launch is delegated to do_launch.
///
/// NOTE(review): the division truncates, so item counts that are not a
/// multiple of the group size would lose the remainder once the group size is
/// no longer {1, 1, 1} - confirm the intended rounding.
/// NOTE(review): LaunchSizeArgs->Dimensions is not read here.
Error olLaunchKernelSuggestedGroupSize_impl(
    ol_queue_handle_t Queue, ol_device_handle_t Device,
    ol_kernel_handle_t Kernel, const void *ArgumentsData, size_t ArgumentsSize,
    const ol_kernel_launch_size_suggested_args_t *LaunchSizeArgs,
    ol_event_handle_t *EventOut) {
  // TODO: Use backend specific magic to determine the best work group size
  const size_t GroupSize[3] = {1, 1, 1};
  const size_t ItemCount[3] = {LaunchSizeArgs->NumItemsX,
                               LaunchSizeArgs->NumItemsY,
                               LaunchSizeArgs->NumItemsZ};

  KernelArgsTy LaunchArgs{};
  for (int Dim = 0; Dim < 3; ++Dim) {
    LaunchArgs.NumTeams[Dim] = ItemCount[Dim] / GroupSize[Dim];
    LaunchArgs.ThreadLimit[Dim] = GroupSize[Dim];
  }
  LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;

  // Wrap the caller's argument buffer; ArgPtrs is pointed at this wrapper and
  // not at an array of per-argument pointers.
  KernelLaunchParamsTy Params;
  Params.Data = const_cast<void *>(ArgumentsData);
  Params.Size = ArgumentsSize;
  LaunchArgs.ArgPtrs = reinterpret_cast<void **>(&Params);
  // Don't do anything with pointer indirection; use arg data as-is
  LaunchArgs.Flags.IsCUDA = true;

  return do_launch(Queue, Device, Kernel, LaunchArgs, EventOut);
}
530565

531566
} // namespace offload

0 commit comments

Comments
 (0)