Skip to content

Commit 55f2d02

Browse files
author
Victor Lomuller
committed
Add new launch property to support work_group_scratch_memory
intel/llvm#15061 introduces a new property, work_group_scratch_memory, which allows the user to set a given amount of local memory to be used. In order to pass this information to the adapter, the patch adds a new launch property to urEnqueueKernelLaunchCustomExp. The patch also changes the signature of urEnqueueKernelLaunchCustomExp to add a global offset, in order to maintain existing features when using this extension. Signed-off-by: Victor Lomuller <[email protected]>
1 parent cd92e72 commit 55f2d02

File tree

17 files changed

+233
-68
lines changed

17 files changed

+233
-68
lines changed

include/ur_api.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9530,6 +9530,7 @@ typedef enum ur_exp_launch_property_id_t {
95309530
UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, ///< The property has no effect
95319531
UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, ///< Whether to launch a cooperative kernel
95329532
UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
9533+
UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
95339534
/// @cond
95349535
UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
95359536
/// @endcond
@@ -9543,10 +9544,12 @@ typedef enum ur_exp_launch_property_id_t {
95439544
/// _Analogues_
95449545
/// - **CUlaunchAttributeValue**
95459546
typedef union ur_exp_launch_property_value_t {
9546-
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9547-
///< value must be a divisor of the corresponding global work-size
9548-
///< dimension (in units of work-group).
9549-
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
9547+
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9548+
///< value must be a divisor of the corresponding global work-size
9549+
///< dimension (in units of work-group).
9550+
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
9551+
size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to
9552+
///< allocate
95509553

95519554
} ur_exp_launch_property_value_t;
95529555

@@ -9587,6 +9590,7 @@ typedef struct ur_exp_launch_property_t {
95879590
/// + NULL == hQueue
95889591
/// + NULL == hKernel
95899592
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
9593+
/// + `NULL == pGlobalWorkOffset`
95909594
/// + `NULL == pGlobalWorkSize`
95919595
/// + `NULL == launchPropList`
95929596
/// + NULL == pGlobalWorkSize
@@ -9615,6 +9619,8 @@ urEnqueueKernelLaunchCustomExp(
96159619
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
96169620
uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
96179621
///< work-group work-items
9622+
const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
9623+
///< offset used to calculate the global ID of a work-item
96189624
const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
96199625
///< number of global work-items in workDim that will execute the kernel
96209626
///< function
@@ -11441,6 +11447,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t {
1144111447
ur_queue_handle_t *phQueue;
1144211448
ur_kernel_handle_t *phKernel;
1144311449
uint32_t *pworkDim;
11450+
const size_t **ppGlobalWorkOffset;
1144411451
const size_t **ppGlobalWorkSize;
1144511452
const size_t **ppLocalWorkSize;
1144611453
uint32_t *pnumPropsInLaunchPropList;

include/ur_ddi.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1457,6 +1457,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)(
14571457
uint32_t,
14581458
const size_t *,
14591459
const size_t *,
1460+
const size_t *,
14601461
uint32_t,
14611462
const ur_exp_launch_property_t *,
14621463
uint32_t,

include/ur_print.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10246,6 +10246,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id
1024610246
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
1024710247
os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION";
1024810248
break;
10249+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10250+
os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY";
10251+
break;
1024910252
default:
1025010253
os << "unknown enumerator";
1025110254
break;
@@ -10282,6 +10285,13 @@ inline ur_result_t printUnion(
1028210285

1028310286
os << (params.cooperative);
1028410287

10288+
break;
10289+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10290+
10291+
os << ".workgroup_mem_size = ";
10292+
10293+
os << (params.workgroup_mem_size);
10294+
1028510295
break;
1028610296
default:
1028710297
os << "<unknown>";
@@ -14722,6 +14732,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
1472214732

1472314733
os << *(params->pworkDim);
1472414734

14735+
os << ", ";
14736+
os << ".pGlobalWorkOffset = ";
14737+
14738+
ur::details::printPtr(os,
14739+
*(params->ppGlobalWorkOffset));
14740+
1472514741
os << ", ";
1472614742
os << ".pGlobalWorkSize = ";
1472714743

scripts/core/exp-launch-properties.yml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ etors:
2929
desc: "Whether to launch a cooperative kernel"
3030
- name: CLUSTER_DIMENSION
3131
desc: "work-group cluster dimensions"
32+
- name: WORK_GROUP_MEMORY
33+
desc: "Implicit work group memory allocation"
3234
--- #--------------------------------------------------------------------------
3335
type: union
3436
desc: "Specifies a launch property value"
@@ -45,6 +47,10 @@ members:
4547
name: cooperative
4648
desc: "[in] non-zero value indicates a cooperative kernel"
4749
tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE
50+
- type: size_t
51+
name: workgroup_mem_size
52+
desc: "[in] non-zero value indicates the amount of work group memory to allocate"
53+
tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
4854
--- #--------------------------------------------------------------------------
4955
type: struct
5056
desc: "Kernel launch property"
@@ -82,6 +88,9 @@ params:
8288
- type: uint32_t
8389
name: workDim
8490
desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
91+
- type: "const size_t*"
92+
name: pGlobalWorkOffset
93+
desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
8594
- type: const size_t*
8695
name: pGlobalWorkSize
8796
desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
@@ -97,10 +106,10 @@ params:
97106
- type: uint32_t
98107
name: numEventsInWaitList
99108
desc: "[in] size of the event wait list"
100-
- type: const ur_event_handle_t*
109+
- type: const $x_event_handle_t*
101110
name: phEventWaitList
102111
desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. "
103-
- type: ur_event_handle_t*
112+
- type: $x_event_handle_t*
104113
name: phEvent
105114
desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array."
106115
returns:

source/adapters/cuda/enqueue.cpp

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -414,11 +414,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
414414
phEventWaitList, phEvent);
415415
}
416416

417-
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
418-
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
419-
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
420-
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
421-
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
417+
static ur_result_t
418+
enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
419+
uint32_t workDim, const size_t *pGlobalWorkOffset,
420+
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
421+
uint32_t numEventsInWaitList,
422+
const ur_event_handle_t *phEventWaitList,
423+
ur_event_handle_t *phEvent, size_t WorkGroupMemory) {
422424
// Preconditions
423425
UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
424426
UR_RESULT_ERROR_INVALID_KERNEL);
@@ -436,6 +438,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
436438
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
437439
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
438440

441+
// Set work group memory so we can compute the whole memory requirement
442+
if (WorkGroupMemory)
443+
hKernel->setWorkGroupMemory(WorkGroupMemory);
439444
uint32_t LocalSize = hKernel->getLocalSize();
440445
CUfunction CuFunc = hKernel->get();
441446

@@ -498,6 +503,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
498503
return UR_RESULT_SUCCESS;
499504
}
500505

506+
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
507+
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
508+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
509+
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
510+
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
511+
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
512+
pGlobalWorkSize, pLocalWorkSize,
513+
numEventsInWaitList, phEventWaitList, phEvent, 0);
514+
}
515+
501516
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
502517
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
503518
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -508,8 +523,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
508523
coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
509524
coop_prop.value.cooperative = 1;
510525
return urEnqueueKernelLaunchCustomExp(
511-
hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
512-
&coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
526+
hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
527+
pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
528+
phEvent);
513529
}
514530
return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
515531
pGlobalWorkSize, pLocalWorkSize,
@@ -518,16 +534,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
518534

519535
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
520536
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
521-
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
522-
uint32_t numPropsInLaunchPropList,
537+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
538+
const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
523539
const ur_exp_launch_property_t *launchPropList,
524540
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
525541
ur_event_handle_t *phEvent) {
526542

527-
if (numPropsInLaunchPropList == 0) {
528-
urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
529-
pLocalWorkSize, numEventsInWaitList, phEventWaitList,
530-
phEvent);
543+
size_t WorkGroupMemory = [&]() -> size_t {
544+
const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
545+
launchPropList, launchPropList + numPropsInLaunchPropList,
546+
[](const ur_exp_launch_property_t &Prop) {
547+
return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
548+
});
549+
if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
550+
return WorkGroupMemoryProp->value.workgroup_mem_size;
551+
return 0;
552+
}();
553+
554+
if (numPropsInLaunchPropList == 0 ||
555+
(WorkGroupMemory && numPropsInLaunchPropList == 1)) {
556+
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
557+
pGlobalWorkSize, pLocalWorkSize,
558+
numEventsInWaitList, phEventWaitList, phEvent,
559+
WorkGroupMemory);
531560
}
532561
#if CUDA_VERSION >= 11080
533562
// Preconditions
@@ -540,7 +569,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
540569
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
541570
}
542571

543-
std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
572+
std::vector<CUlaunchAttribute> launch_attribute;
573+
launch_attribute.reserve(numPropsInLaunchPropList);
544574

545575
// Early exit for zero size kernel
546576
if (*pGlobalWorkSize == 0) {
@@ -553,17 +583,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
553583
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
554584
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
555585

586+
// Set work group memory so we can compute the whole memory requirement
587+
if (WorkGroupMemory)
588+
hKernel->setWorkGroupMemory(WorkGroupMemory);
556589
uint32_t LocalSize = hKernel->getLocalSize();
557590
CUfunction CuFunc = hKernel->get();
558591

559592
for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
560593
switch (launchPropList[i].id) {
561594
case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
595+
launch_attribute.push_back({});
562596
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
563597
break;
564598
}
565599
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
566-
600+
launch_attribute.push_back({});
567601
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
568602
// Note that cuda orders from right to left wrt SYCL dimensional order.
569603
if (workDim == 3) {
@@ -595,11 +629,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
595629
break;
596630
}
597631
case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
632+
launch_attribute.push_back({});
598633
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
599634
launch_attribute[i].value.cooperative =
600635
launchPropList[i].value.cooperative;
601636
break;
602637
}
638+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
639+
break;
640+
}
603641
default: {
604642
return UR_RESULT_ERROR_INVALID_ENUMERATION;
605643
}
@@ -610,8 +648,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
610648
// using the standard UR_CHECK_ERROR
611649
if (ur_result_t Ret =
612650
setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
613-
nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
614-
CuFunc, ThreadsPerBlock, BlocksPerGrid);
651+
pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
652+
hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
615653
Ret != UR_RESULT_SUCCESS)
616654
return Ret;
617655

@@ -659,7 +697,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
659697
launch_config.sharedMemBytes = LocalSize;
660698
launch_config.hStream = CuStream;
661699
launch_config.attrs = &launch_attribute[0];
662-
launch_config.numAttrs = numPropsInLaunchPropList;
700+
launch_config.numAttrs = launch_attribute.size();
663701

664702
UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
665703
const_cast<void **>(ArgIndices.data()),

source/adapters/cuda/kernel.hpp

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ struct ur_kernel_handle_t_ {
6565
args_size_t ParamSizes;
6666
args_index_t Indices;
6767
args_size_t OffsetPerIndex;
68+
size_t WorkGroupMemory = 0;
69+
6870
// A struct to keep track of memargs so that we can do dependency analysis
6971
// at urEnqueueKernelLaunch
7072
struct mem_obj_arg {
@@ -105,22 +107,28 @@ struct ur_kernel_handle_t_ {
105107
OffsetPerIndex[Index] = LocalSize;
106108
}
107109

108-
void addLocalArg(size_t Index, size_t Size) {
109-
size_t LocalOffset = this->getLocalSize();
110-
111-
// maximum required alignment is the size of the largest vector type
112-
const size_t MaxAlignment = sizeof(double) * 16;
110+
// maximum required alignment is the size of the largest vector type
111+
static constexpr size_t MaxAlignment = sizeof(double) * 16;
113112

113+
static size_t alignMemoryAllocation(size_t Size, size_t Offset) {
114114
// for arguments smaller than the maximum alignment simply align to the
115115
// size of the argument
116116
const size_t Alignment = std::min(MaxAlignment, Size);
117117

118118
// align the argument
119-
size_t AlignedLocalOffset = LocalOffset;
120-
size_t Pad = LocalOffset % Alignment;
119+
size_t AlignedLocalOffset = Offset;
120+
size_t Pad = Offset % Alignment;
121121
if (Pad != 0) {
122122
AlignedLocalOffset += Alignment - Pad;
123123
}
124+
return AlignedLocalOffset;
125+
}
126+
127+
void addLocalArg(size_t Index, size_t Size) {
128+
size_t LocalOffset = this->getLocalSize();
129+
130+
// align the argument
131+
size_t AlignedLocalOffset = alignMemoryAllocation(Size, LocalOffset);
124132

125133
addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
126134
Size + (AlignedLocalOffset - LocalOffset));
@@ -140,20 +148,40 @@ struct ur_kernel_handle_t_ {
140148
MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags});
141149
}
142150

151+
void setWorkGroupMemory(size_t memSize) {
152+
assert(WorkGroupMemory == 0 &&
153+
"Work Group Memory size can only be set once");
154+
// Ensure first offset is MaxAlignment aligned
155+
WorkGroupMemory = alignMemoryAllocation(MaxAlignment, memSize);
156+
157+
// Adjust local accessor setting
158+
// the dynamic memory will start at offset 0 (this allows us to keep
159+
// accessing local memory as a GV) and accessors will use the rest of the range
160+
for (size_t i = 0; i < OffsetPerIndex.size(); i++) {
161+
// if the offset is 0, then it is not a local accessor argument.
162+
if (!OffsetPerIndex[i])
163+
continue;
164+
assert(ParamSizes[i] == sizeof(size_t) && "Offset should be a size_t");
165+
*reinterpret_cast<size_t *>(Indices[i]) += WorkGroupMemory;
166+
}
167+
}
168+
143169
void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) {
144170
assert(Size == sizeof(std::uint32_t) * 3);
145171
std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
146172
}
147173

148174
void clearLocalSize() {
149175
std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0);
176+
WorkGroupMemory = 0;
150177
}
151178

152179
const args_index_t &getIndices() const noexcept { return Indices; }
153180

154181
uint32_t getLocalSize() const {
155182
return std::accumulate(std::begin(OffsetPerIndex),
156-
std::end(OffsetPerIndex), 0);
183+
std::end(OffsetPerIndex), 0) +
184+
WorkGroupMemory;
157185
}
158186
} Args;
159187

@@ -238,6 +266,7 @@ struct ur_kernel_handle_t_ {
238266
return Args.getIndices();
239267
}
240268

269+
void setWorkGroupMemory(size_t memSize) { Args.setWorkGroupMemory(memSize); }
241270
uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); }
242271

243272
void clearLocalSize() { Args.clearLocalSize(); }

0 commit comments

Comments
 (0)