Skip to content

Commit ae9804c

Browse files
author
Victor Lomuller
committed
Add new launch property to support work_group_scratch_memory
intel/llvm#15061 introduces a new property, work_group_scratch_memory, which allows the user to set a given amount of local memory to be used. In order to pass this information to the adapter, the patch adds a new launch property to urEnqueueKernelLaunchCustomExp. The patch also changes the signature of urEnqueueKernelLaunchCustomExp to add a global offset in order to maintain features when using this extension. Signed-off-by: Victor Lomuller <[email protected]>
1 parent 30391c6 commit ae9804c

File tree

17 files changed

+233
-68
lines changed

17 files changed

+233
-68
lines changed

include/ur_api.h

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9537,6 +9537,7 @@ typedef enum ur_exp_launch_property_id_t {
95379537
UR_EXP_LAUNCH_PROPERTY_ID_IGNORE = 0, ///< The property has no effect
95389538
UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE = 1, ///< Whether to launch a cooperative kernel
95399539
UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION = 2, ///< work-group cluster dimensions
9540+
UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY = 3, ///< Implicit work group memory allocation
95409541
/// @cond
95419542
UR_EXP_LAUNCH_PROPERTY_ID_FORCE_UINT32 = 0x7fffffff
95429543
/// @endcond
@@ -9550,10 +9551,12 @@ typedef enum ur_exp_launch_property_id_t {
95509551
/// _Analogues_
95519552
/// - **CUlaunchAttributeValue**
95529553
typedef union ur_exp_launch_property_value_t {
9553-
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9554-
///< value must be a divisor of the corresponding global work-size
9555-
///< dimension (in units of work-group).
9556-
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
9554+
uint32_t clusterDim[3]; ///< [in] dimensions of the cluster (units of work-group) (x, y, z). Each
9555+
///< value must be a divisor of the corresponding global work-size
9556+
///< dimension (in units of work-group).
9557+
int cooperative; ///< [in] non-zero value indicates a cooperative kernel
9558+
size_t workgroup_mem_size; ///< [in] non-zero value indicates the amount of work group memory to
9559+
///< allocate
95579560

95589561
} ur_exp_launch_property_value_t;
95599562

@@ -9594,6 +9597,7 @@ typedef struct ur_exp_launch_property_t {
95949597
/// + NULL == hQueue
95959598
/// + NULL == hKernel
95969599
/// - ::UR_RESULT_ERROR_INVALID_NULL_POINTER
9600+
/// + `NULL == pGlobalWorkOffset`
95979601
/// + `NULL == pGlobalWorkSize`
95989602
/// + `NULL == launchPropList`
95999603
/// + NULL == pGlobalWorkSize
@@ -9622,6 +9626,8 @@ urEnqueueKernelLaunchCustomExp(
96229626
ur_kernel_handle_t hKernel, ///< [in] handle of the kernel object
96239627
uint32_t workDim, ///< [in] number of dimensions, from 1 to 3, to specify the global and
96249628
///< work-group work-items
9629+
const size_t *pGlobalWorkOffset, ///< [in] pointer to an array of workDim unsigned values that specify the
9630+
///< offset used to calculate the global ID of a work-item
96259631
const size_t *pGlobalWorkSize, ///< [in] pointer to an array of workDim unsigned values that specify the
96269632
///< number of global work-items in workDim that will execute the kernel
96279633
///< function
@@ -11531,6 +11537,7 @@ typedef struct ur_enqueue_kernel_launch_custom_exp_params_t {
1153111537
ur_queue_handle_t *phQueue;
1153211538
ur_kernel_handle_t *phKernel;
1153311539
uint32_t *pworkDim;
11540+
const size_t **ppGlobalWorkOffset;
1153411541
const size_t **ppGlobalWorkSize;
1153511542
const size_t **ppLocalWorkSize;
1153611543
uint32_t *pnumPropsInLaunchPropList;

include/ur_ddi.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1467,6 +1467,7 @@ typedef ur_result_t(UR_APICALL *ur_pfnEnqueueKernelLaunchCustomExp_t)(
14671467
uint32_t,
14681468
const size_t *,
14691469
const size_t *,
1470+
const size_t *,
14701471
uint32_t,
14711472
const ur_exp_launch_property_t *,
14721473
uint32_t,

include/ur_print.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10290,6 +10290,9 @@ inline std::ostream &operator<<(std::ostream &os, enum ur_exp_launch_property_id
1029010290
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION:
1029110291
os << "UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION";
1029210292
break;
10293+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10294+
os << "UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY";
10295+
break;
1029310296
default:
1029410297
os << "unknown enumerator";
1029510298
break;
@@ -10326,6 +10329,13 @@ inline ur_result_t printUnion(
1032610329

1032710330
os << (params.cooperative);
1032810331

10332+
break;
10333+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY:
10334+
10335+
os << ".workgroup_mem_size = ";
10336+
10337+
os << (params.workgroup_mem_size);
10338+
1032910339
break;
1033010340
default:
1033110341
os << "<unknown>";
@@ -14837,6 +14847,12 @@ inline std::ostream &operator<<(std::ostream &os, [[maybe_unused]] const struct
1483714847

1483814848
os << *(params->pworkDim);
1483914849

14850+
os << ", ";
14851+
os << ".pGlobalWorkOffset = ";
14852+
14853+
ur::details::printPtr(os,
14854+
*(params->ppGlobalWorkOffset));
14855+
1484014856
os << ", ";
1484114857
os << ".pGlobalWorkSize = ";
1484214858

scripts/core/exp-launch-properties.yml

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ etors:
2929
desc: "Whether to launch a cooperative kernel"
3030
- name: CLUSTER_DIMENSION
3131
desc: "work-group cluster dimensions"
32+
- name: WORK_GROUP_MEMORY
33+
desc: "Implicit work group memory allocation"
3234
--- #--------------------------------------------------------------------------
3335
type: union
3436
desc: "Specifies a launch property value"
@@ -45,6 +47,10 @@ members:
4547
name: cooperative
4648
desc: "[in] non-zero value indicates a cooperative kernel"
4749
tag: $X_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE
50+
- type: size_t
51+
name: workgroup_mem_size
52+
desc: "[in] non-zero value indicates the amount of work group memory to allocate"
53+
tag: $X_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY
4854
--- #--------------------------------------------------------------------------
4955
type: struct
5056
desc: "Kernel launch property"
@@ -82,6 +88,9 @@ params:
8288
- type: uint32_t
8389
name: workDim
8490
desc: "[in] number of dimensions, from 1 to 3, to specify the global and work-group work-items"
91+
- type: "const size_t*"
92+
name: pGlobalWorkOffset
93+
desc: "[in] pointer to an array of workDim unsigned values that specify the offset used to calculate the global ID of a work-item"
8594
- type: const size_t*
8695
name: pGlobalWorkSize
8796
desc: "[in] pointer to an array of workDim unsigned values that specify the number of global work-items in workDim that will execute the kernel function"
@@ -97,10 +106,10 @@ params:
97106
- type: uint32_t
98107
name: numEventsInWaitList
99108
desc: "[in] size of the event wait list"
100-
- type: const ur_event_handle_t*
109+
- type: const $x_event_handle_t*
101110
name: phEventWaitList
102111
desc: "[in][optional][range(0, numEventsInWaitList)] pointer to a list of events that must be complete before the kernel execution. If nullptr, the numEventsInWaitList must be 0, indicating that no wait event. "
103-
- type: ur_event_handle_t*
112+
- type: $x_event_handle_t*
104113
name: phEvent
105114
desc: "[out][optional] return an event object that identifies this particular kernel execution instance. If phEventWaitList and phEvent are not NULL, phEvent must not refer to an element of the phEventWaitList array."
106115
returns:

source/adapters/cuda/enqueue.cpp

Lines changed: 56 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -422,11 +422,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait(
422422
phEventWaitList, phEvent);
423423
}
424424

425-
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
426-
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
427-
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
428-
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
429-
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
425+
static ur_result_t
426+
enqueueKernelLaunch(ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel,
427+
uint32_t workDim, const size_t *pGlobalWorkOffset,
428+
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
429+
uint32_t numEventsInWaitList,
430+
const ur_event_handle_t *phEventWaitList,
431+
ur_event_handle_t *phEvent, size_t WorkGroupMemory) {
430432
// Preconditions
431433
UR_ASSERT(hQueue->getDevice() == hKernel->getProgram()->getDevice(),
432434
UR_RESULT_ERROR_INVALID_KERNEL);
@@ -444,6 +446,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
444446
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
445447
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
446448

449+
// Set work group memory so we can compute the whole memory requirement
450+
if (WorkGroupMemory)
451+
hKernel->setWorkGroupMemory(WorkGroupMemory);
447452
uint32_t LocalSize = hKernel->getLocalSize();
448453
CUfunction CuFunc = hKernel->get();
449454

@@ -506,6 +511,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
506511
return UR_RESULT_SUCCESS;
507512
}
508513

514+
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
515+
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
516+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
517+
const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
518+
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
519+
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
520+
pGlobalWorkSize, pLocalWorkSize,
521+
numEventsInWaitList, phEventWaitList, phEvent, 0);
522+
}
523+
509524
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
510525
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
511526
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
@@ -516,8 +531,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
516531
coop_prop.id = UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE;
517532
coop_prop.value.cooperative = 1;
518533
return urEnqueueKernelLaunchCustomExp(
519-
hQueue, hKernel, workDim, pGlobalWorkSize, pLocalWorkSize, 1,
520-
&coop_prop, numEventsInWaitList, phEventWaitList, phEvent);
534+
hQueue, hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize,
535+
pLocalWorkSize, 1, &coop_prop, numEventsInWaitList, phEventWaitList,
536+
phEvent);
521537
}
522538
return urEnqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
523539
pGlobalWorkSize, pLocalWorkSize,
@@ -526,16 +542,29 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCooperativeKernelLaunchExp(
526542

527543
UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
528544
ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
529-
const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize,
530-
uint32_t numPropsInLaunchPropList,
545+
const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
546+
const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
531547
const ur_exp_launch_property_t *launchPropList,
532548
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
533549
ur_event_handle_t *phEvent) {
534550

535-
if (numPropsInLaunchPropList == 0) {
536-
urEnqueueKernelLaunch(hQueue, hKernel, workDim, nullptr, pGlobalWorkSize,
537-
pLocalWorkSize, numEventsInWaitList, phEventWaitList,
538-
phEvent);
551+
size_t WorkGroupMemory = [&]() -> size_t {
552+
const ur_exp_launch_property_t *WorkGroupMemoryProp = std::find_if(
553+
launchPropList, launchPropList + numPropsInLaunchPropList,
554+
[](const ur_exp_launch_property_t &Prop) {
555+
return Prop.id == UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY;
556+
});
557+
if (WorkGroupMemoryProp != launchPropList + numPropsInLaunchPropList)
558+
return WorkGroupMemoryProp->value.workgroup_mem_size;
559+
return 0;
560+
}();
561+
562+
if (numPropsInLaunchPropList == 0 ||
563+
(WorkGroupMemory && numPropsInLaunchPropList == 1)) {
564+
return enqueueKernelLaunch(hQueue, hKernel, workDim, pGlobalWorkOffset,
565+
pGlobalWorkSize, pLocalWorkSize,
566+
numEventsInWaitList, phEventWaitList, phEvent,
567+
WorkGroupMemory);
539568
}
540569
#if CUDA_VERSION >= 11080
541570
// Preconditions
@@ -548,7 +577,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
548577
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
549578
}
550579

551-
std::vector<CUlaunchAttribute> launch_attribute(numPropsInLaunchPropList);
580+
std::vector<CUlaunchAttribute> launch_attribute;
581+
launch_attribute.reserve(numPropsInLaunchPropList);
552582

553583
// Early exit for zero size kernel
554584
if (*pGlobalWorkSize == 0) {
@@ -561,17 +591,21 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
561591
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
562592
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
563593

594+
// Set work group memory so we can compute the whole memory requirement
595+
if (WorkGroupMemory)
596+
hKernel->setWorkGroupMemory(WorkGroupMemory);
564597
uint32_t LocalSize = hKernel->getLocalSize();
565598
CUfunction CuFunc = hKernel->get();
566599

567600
for (uint32_t i = 0; i < numPropsInLaunchPropList; i++) {
568601
switch (launchPropList[i].id) {
569602
case UR_EXP_LAUNCH_PROPERTY_ID_IGNORE: {
603+
launch_attribute.push_back({});
570604
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_IGNORE;
571605
break;
572606
}
573607
case UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION: {
574-
608+
launch_attribute.push_back({});
575609
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
576610
// Note that cuda orders from right to left wrt SYCL dimensional order.
577611
if (workDim == 3) {
@@ -603,11 +637,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
603637
break;
604638
}
605639
case UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE: {
640+
launch_attribute.push_back({});
606641
launch_attribute[i].id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE;
607642
launch_attribute[i].value.cooperative =
608643
launchPropList[i].value.cooperative;
609644
break;
610645
}
646+
case UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY: {
647+
break;
648+
}
611649
default: {
612650
return UR_RESULT_ERROR_INVALID_ENUMERATION;
613651
}
@@ -618,8 +656,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
618656
// using the standard UR_CHECK_ERROR
619657
if (ur_result_t Ret =
620658
setKernelParams(hQueue->getContext(), hQueue->Device, workDim,
621-
nullptr, pGlobalWorkSize, pLocalWorkSize, hKernel,
622-
CuFunc, ThreadsPerBlock, BlocksPerGrid);
659+
pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize,
660+
hKernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
623661
Ret != UR_RESULT_SUCCESS)
624662
return Ret;
625663

@@ -667,7 +705,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunchCustomExp(
667705
launch_config.sharedMemBytes = LocalSize;
668706
launch_config.hStream = CuStream;
669707
launch_config.attrs = &launch_attribute[0];
670-
launch_config.numAttrs = numPropsInLaunchPropList;
708+
launch_config.numAttrs = launch_attribute.size();
671709

672710
UR_CHECK_ERROR(cuLaunchKernelEx(&launch_config, CuFunc,
673711
const_cast<void **>(ArgIndices.data()),

source/adapters/cuda/kernel.hpp

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ struct ur_kernel_handle_t_ {
6565
args_size_t ParamSizes;
6666
args_index_t Indices;
6767
args_size_t OffsetPerIndex;
68+
size_t WorkGroupMemory = 0;
69+
6870
// A struct to keep track of memargs so that we can do dependency analysis
6971
// at urEnqueueKernelLaunch
7072
struct mem_obj_arg {
@@ -105,22 +107,28 @@ struct ur_kernel_handle_t_ {
105107
OffsetPerIndex[Index] = LocalSize;
106108
}
107109

108-
void addLocalArg(size_t Index, size_t Size) {
109-
size_t LocalOffset = this->getLocalSize();
110-
111-
// maximum required alignment is the size of the largest vector type
112-
const size_t MaxAlignment = sizeof(double) * 16;
110+
// maximum required alignment is the size of the largest vector type
111+
static constexpr size_t MaxAlignment = sizeof(double) * 16;
113112

113+
static size_t alignMemoryAllocation(size_t Size, size_t Offset) {
114114
// for arguments smaller than the maximum alignment simply align to the
115115
// size of the argument
116116
const size_t Alignment = std::min(MaxAlignment, Size);
117117

118118
// align the argument
119-
size_t AlignedLocalOffset = LocalOffset;
120-
size_t Pad = LocalOffset % Alignment;
119+
size_t AlignedLocalOffset = Offset;
120+
size_t Pad = Offset % Alignment;
121121
if (Pad != 0) {
122122
AlignedLocalOffset += Alignment - Pad;
123123
}
124+
return AlignedLocalOffset;
125+
}
126+
127+
void addLocalArg(size_t Index, size_t Size) {
128+
size_t LocalOffset = this->getLocalSize();
129+
130+
// align the argument
131+
size_t AlignedLocalOffset = alignMemoryAllocation(Size, LocalOffset);
124132

125133
addArg(Index, sizeof(size_t), (const void *)&(AlignedLocalOffset),
126134
Size + (AlignedLocalOffset - LocalOffset));
@@ -140,20 +148,40 @@ struct ur_kernel_handle_t_ {
140148
MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags});
141149
}
142150

151+
void setWorkGroupMemory(size_t memSize) {
152+
assert(WorkGroupMemory == 0 &&
153+
"Work Group Memory size can only be set once");
154+
// Ensure first offset is MaxAlignment aligned
155+
WorkGroupMemory = alignMemoryAllocation(MaxAlignment, memSize);
156+
157+
// Adjust local accessor setting
158+
// the dynamic memory will start at offset 0 (allows us to keep accessing
159+
// local memory as a GV) and accessors will use the rest of the range
160+
for (size_t i = 0; i < OffsetPerIndex.size(); i++) {
161+
// if offset is 0, it is not a local accessor argument.
162+
if (!OffsetPerIndex[i])
163+
continue;
164+
assert(ParamSizes[i] == sizeof(size_t) && "Offset should be a size_t");
165+
*reinterpret_cast<size_t *>(Indices[i]) += WorkGroupMemory;
166+
}
167+
}
168+
143169
void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) {
144170
assert(Size == sizeof(std::uint32_t) * 3);
145171
std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
146172
}
147173

148174
void clearLocalSize() {
149175
std::fill(std::begin(OffsetPerIndex), std::end(OffsetPerIndex), 0);
176+
WorkGroupMemory = 0;
150177
}
151178

152179
const args_index_t &getIndices() const noexcept { return Indices; }
153180

154181
uint32_t getLocalSize() const {
155182
return std::accumulate(std::begin(OffsetPerIndex),
156-
std::end(OffsetPerIndex), 0);
183+
std::end(OffsetPerIndex), 0) +
184+
WorkGroupMemory;
157185
}
158186
} Args;
159187

@@ -238,6 +266,7 @@ struct ur_kernel_handle_t_ {
238266
return Args.getIndices();
239267
}
240268

269+
void setWorkGroupMemory(size_t memSize) { Args.setWorkGroupMemory(memSize); }
241270
uint32_t getLocalSize() const noexcept { return Args.getLocalSize(); }
242271

243272
void clearLocalSize() { Args.clearLocalSize(); }

0 commit comments

Comments
 (0)