Skip to content

Commit be7ae13

Browse files
Store SLM sizes per root device in Kernel
Reduce usages of getDefaultKernelInfo.
Related-To: NEO-5001
Signed-off-by: Mateusz Jablonski <[email protected]>
1 parent 09bdd2a commit be7ae13

15 files changed, with 48 additions (+48) and 39 deletions (-39).

opencl/source/api/api.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1845,7 +1845,7 @@ cl_int CL_API_CALL clSetKernelArg(cl_kernel kernel,
18451845
retVal = CL_INVALID_KERNEL;
18461846
break;
18471847
}
1848-
if (pKernel->getDefaultKernelInfo().kernelArgInfo.size() <= argIndex) {
1848+
if (pKernel->getKernelArguments().size() <= argIndex) {
18491849
retVal = CL_INVALID_ARG_INDEX;
18501850
break;
18511851
}

opencl/source/helpers/dispatch_info.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111

1212
namespace NEO {
1313
bool DispatchInfo::usesSlm() const {
14-
return (kernel == nullptr) ? false : kernel->slmTotalSize > 0;
14+
return (kernel == nullptr) ? false : kernel->getSlmTotalSize(pClDevice->getRootDeviceIndex()) > 0;
1515
}
1616

1717
bool DispatchInfo::usesStatelessPrintfSurface() const {

opencl/source/helpers/hardware_commands_helper_base.inl

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -180,8 +180,10 @@ size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
180180

181181
interfaceDescriptor.setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);
182182

183+
auto slmTotalSize = kernel.getSlmTotalSize(rootDeviceIndex);
184+
183185
setGrfInfo(&interfaceDescriptor, kernel, sizeCrossThreadData, sizePerThreadData, rootDeviceIndex);
184-
EncodeDispatchKernel<GfxFamily>::appendAdditionalIDDFields(&interfaceDescriptor, hardwareInfo, threadsPerThreadGroup, kernel.slmTotalSize, SlmPolicy::SlmPolicyNone);
186+
EncodeDispatchKernel<GfxFamily>::appendAdditionalIDDFields(&interfaceDescriptor, hardwareInfo, threadsPerThreadGroup, slmTotalSize, SlmPolicy::SlmPolicyNone);
185187

186188
interfaceDescriptor.setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer));
187189

@@ -190,7 +192,7 @@ size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
190192
EncodeDispatchKernel<GfxFamily>::adjustBindingTablePrefetch(interfaceDescriptor, numSamplers, bindingTablePrefetchSize);
191193

192194
auto programmableIDSLMSize =
193-
static_cast<SHARED_LOCAL_MEMORY_SIZE>(HwHelperHw<GfxFamily>::get().computeSlmValues(hardwareInfo, kernel.slmTotalSize));
195+
static_cast<SHARED_LOCAL_MEMORY_SIZE>(HwHelperHw<GfxFamily>::get().computeSlmValues(hardwareInfo, slmTotalSize));
194196

195197
interfaceDescriptor.setSharedLocalMemorySize(programmableIDSLMSize);
196198
EncodeDispatchKernel<GfxFamily>::programBarrierEnable(interfaceDescriptor,

opencl/source/helpers/task_information.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -166,6 +166,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
166166
printfHandler.get()->makeResident(commandStreamReceiver);
167167
}
168168
makeTimestampPacketsResident(commandStreamReceiver);
169+
auto rootDeviceIndex = commandQueue.getDevice().getRootDeviceIndex();
169170

170171
if (executionModelKernel) {
171172
uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1;
@@ -195,7 +196,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
195196
scheduler.makeResident(commandStreamReceiver);
196197

197198
// Update SLM usage
198-
slmUsed |= scheduler.slmTotalSize > 0;
199+
slmUsed |= scheduler.getSlmTotalSize(rootDeviceIndex) > 0;
199200

200201
this->kernel->getProgram()->getBlockKernelManager()->makeInternalAllocationsResident(commandStreamReceiver);
201202
}
@@ -210,7 +211,6 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
210211
commandQueue.getGpgpuCommandStreamReceiver(), bcsCsr);
211212
}
212213

213-
auto rootDeviceIndex = commandQueue.getDevice().getRootDeviceIndex();
214214
const auto &kernelDescriptor = kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor;
215215

216216
auto memoryCompressionState = commandStreamReceiver.getMemoryCompressionState(kernel->isAuxTranslationRequired());

opencl/source/kernel/kernel.cpp

Lines changed: 15 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -65,8 +65,7 @@ class Surface;
6565
uint32_t Kernel::dummyPatchLocation = 0xbaddf00d;
6666

6767
Kernel::Kernel(Program *programArg, const KernelInfoContainer &kernelInfosArg, bool schedulerKernel)
68-
: slmTotalSize(kernelInfosArg[programArg->getDevices()[0]->getRootDeviceIndex()]->workloadInfo.slmStaticSize),
69-
isParentKernel(kernelInfosArg[programArg->getDevices()[0]->getRootDeviceIndex()]->kernelDescriptor.kernelAttributes.flags.usesDeviceSideEnqueue),
68+
: isParentKernel(kernelInfosArg[programArg->getDevices()[0]->getRootDeviceIndex()]->kernelDescriptor.kernelAttributes.flags.usesDeviceSideEnqueue),
7069
isSchedulerKernel(schedulerKernel),
7170
executionEnvironment(programArg->getExecutionEnvironment()),
7271
program(programArg),
@@ -78,7 +77,9 @@ Kernel::Kernel(Program *programArg, const KernelInfoContainer &kernelInfosArg, b
7877
program->retainForKernel();
7978
imageTransformer.reset(new ImageTransformer);
8079
for (const auto &pClDevice : deviceVector) {
81-
kernelDeviceInfos[pClDevice->getRootDeviceIndex()].maxKernelWorkGroupSize = static_cast<uint32_t>(pClDevice->getSharedDeviceInfo().maxWorkGroupSize);
80+
auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
81+
kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast<uint32_t>(pClDevice->getSharedDeviceInfo().maxWorkGroupSize);
82+
kernelDeviceInfos[rootDeviceIndex].slmTotalSize = kernelInfosArg[rootDeviceIndex]->workloadInfo.slmStaticSize;
8283
}
8384
}
8485

@@ -100,7 +101,7 @@ Kernel::~Kernel() {
100101
}
101102

102103
for (uint32_t i = 0; i < patchedArgumentsNum; i++) {
103-
if (getDefaultKernelInfo().kernelArgInfo.at(i).isSampler) {
104+
if (SAMPLER_OBJ == getKernelArguments()[i].type) {
104105
auto sampler = castToObject<Sampler>(kernelArguments.at(i).object);
105106
if (sampler) {
106107
sampler->decRefInternal();
@@ -372,6 +373,8 @@ cl_int Kernel::initialize() {
372373
if (program->isKernelDebugEnabled() && kernelInfo.patchInfo.pAllocateSystemThreadSurface) {
373374
debugEnabled = true;
374375
}
376+
auto numArgs = kernelInfo.kernelArgInfo.size();
377+
kernelDeviceInfo.slmSizes.resize(numArgs);
375378
isDeviceInitialized.set(rootDeviceIndex);
376379
}
377380

@@ -384,13 +387,11 @@ cl_int Kernel::initialize() {
384387
auto &defaultKernelInfo = getDefaultKernelInfo();
385388
auto numArgs = defaultKernelInfo.kernelArgInfo.size();
386389
kernelArguments.resize(numArgs);
387-
slmSizes.resize(numArgs);
388390
kernelArgHandlers.resize(numArgs);
389391
kernelArgRequiresCacheFlush.resize(numArgs);
390392

391393
for (uint32_t i = 0; i < numArgs; ++i) {
392394
storeKernelArg(i, NONE_OBJ, nullptr, nullptr, 0);
393-
slmSizes[i] = 0;
394395

395396
// set the argument handler
396397
auto &argInfo = defaultKernelInfo.kernelArgInfo[i];
@@ -483,7 +484,6 @@ cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize,
483484
const _cl_context *ctxt;
484485
cl_uint refCount = 0;
485486
uint64_t nonCannonizedGpuAddress = 0llu;
486-
auto defaultRootDeviceIndex = getDevices()[0]->getRootDeviceIndex();
487487
auto &defaultKernelInfo = getKernelInfo(defaultRootDeviceIndex);
488488

489489
switch (paramName) {
@@ -1112,7 +1112,7 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local
11121112
availableThreadCount,
11131113
dssCount,
11141114
dssCount * KB * hardwareInfo.capabilityTable.slmSize,
1115-
hwHelper.alignSlmSize(slmTotalSize),
1115+
hwHelper.alignSlmSize(kernelDeviceInfos[rootDeviceIndex].slmTotalSize),
11161116
static_cast<uint32_t>(hwHelper.getMaxBarrierRegisterPerSlice()),
11171117
hwHelper.getBarriersCountFromHasBarriers(barrierCount),
11181118
workDim,
@@ -1280,7 +1280,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex,
12801280

12811281
storeKernelArg(argIndex, SLM_OBJ, nullptr, argVal, argSize);
12821282

1283-
slmSizes[argIndex] = argSize;
1283+
kernelDeviceInfos[rootDeviceIndex].slmSizes[argIndex] = argSize;
12841284

12851285
// Extract our current slmOffset
12861286
auto slmOffset = *ptrOffset(crossThreadData,
@@ -1291,7 +1291,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex,
12911291

12921292
// Update all slm offsets after this argIndex
12931293
++argIndex;
1294-
while (argIndex < slmSizes.size()) {
1294+
while (argIndex < kernelDeviceInfos[rootDeviceIndex].slmSizes.size()) {
12951295
const auto &kernelArgInfo = defaultKernelInfo.kernelArgInfo[argIndex];
12961296
auto slmAlignment = kernelArgInfo.slmAlignment;
12971297

@@ -1306,11 +1306,11 @@ cl_int Kernel::setArgLocal(uint32_t argIndex,
13061306
*patchLocation = slmOffset;
13071307
}
13081308

1309-
slmOffset += static_cast<uint32_t>(slmSizes[argIndex]);
1309+
slmOffset += static_cast<uint32_t>(kernelDeviceInfos[rootDeviceIndex].slmSizes[argIndex]);
13101310
++argIndex;
13111311
}
13121312

1313-
slmTotalSize = defaultKernelInfo.workloadInfo.slmStaticSize + alignUp(slmOffset, KB);
1313+
kernelDeviceInfos[rootDeviceIndex].slmTotalSize = defaultKernelInfo.workloadInfo.slmStaticSize + alignUp(slmOffset, KB);
13141314

13151315
return CL_SUCCESS;
13161316
}
@@ -2679,4 +2679,7 @@ void Kernel::setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim) {
26792679
uint32_t Kernel::getMaxKernelWorkGroupSize(uint32_t rootDeviceIndex) const {
26802680
return kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
26812681
}
2682+
uint32_t Kernel::getSlmTotalSize(uint32_t rootDeviceIndex) const {
2683+
return kernelDeviceInfos[rootDeviceIndex].slmTotalSize;
2684+
}
26822685
} // namespace NEO

opencl/source/kernel/kernel.h

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -182,7 +182,7 @@ class Kernel : public BaseObject<_cl_kernel> {
182182
}
183183

184184
size_t getKernelArgsNumber() const {
185-
return getDefaultKernelInfo().kernelArgInfo.size();
185+
return kernelArguments.size();
186186
}
187187

188188
bool requiresSshForBuffers(uint32_t rootDeviceIndex) const {
@@ -308,11 +308,8 @@ class Kernel : public BaseObject<_cl_kernel> {
308308

309309
static uint32_t dummyPatchLocation;
310310

311-
std::vector<size_t> slmSizes;
312-
313311
uint32_t allBufferArgsStateful = CL_TRUE;
314312

315-
uint32_t slmTotalSize;
316313
bool isBuiltIn = false;
317314
const bool isParentKernel;
318315
const bool isSchedulerKernel;
@@ -406,6 +403,7 @@ class Kernel : public BaseObject<_cl_kernel> {
406403
void setNumWorkGroupsValues(uint32_t rootDeviceIndex, uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ);
407404
void setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim);
408405
uint32_t getMaxKernelWorkGroupSize(uint32_t rootDeviceIndex) const;
406+
uint32_t getSlmTotalSize(uint32_t rootDeviceIndex) const;
409407

410408
protected:
411409
struct ObjectCounts {
@@ -574,6 +572,9 @@ class Kernel : public BaseObject<_cl_kernel> {
574572
size_t numberOfBindingTableStates = 0u;
575573
size_t localBindingTableOffset = 0u;
576574

575+
std::vector<size_t> slmSizes;
576+
uint32_t slmTotalSize = 0u;
577+
577578
std::unique_ptr<char[]> pSshLocal;
578579
uint32_t sshLocalSize = 0u;
579580
char *crossThreadData = nullptr;

opencl/source/program/kernel_info.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -138,7 +138,7 @@ WorkSizeInfo::WorkSizeInfo(const DispatchInfo &dispatchInfo) {
138138
this->maxWorkGroupSize = dispatchInfo.getKernel()->getMaxKernelWorkGroupSize(rootDeviceIndex);
139139
this->hasBarriers = kernelInfo.kernelDescriptor.kernelAttributes.usesBarriers();
140140
this->simdSize = static_cast<uint32_t>(kernelInfo.getMaxSimdSize());
141-
this->slmTotalSize = static_cast<uint32_t>(dispatchInfo.getKernel()->slmTotalSize);
141+
this->slmTotalSize = static_cast<uint32_t>(dispatchInfo.getKernel()->getSlmTotalSize(rootDeviceIndex));
142142
this->coreFamily = device.getHardwareInfo().platform.eRenderCoreFamily;
143143
this->numThreadsPerSubSlice = static_cast<uint32_t>(device.getSharedDeviceInfo().maxNumEUsPerSubSlice) *
144144
device.getSharedDeviceInfo().numThreadsPerEU;

opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_2_tests.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -380,7 +380,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, GivenKernelWithSlmWhenPreviousSLML
380380
commandStreamReceiver->lastSentL3Config = L3Config;
381381
commandStreamReceiver->lastSentThreadArbitrationPolicy = kernel.mockKernel->getThreadArbitrationPolicy();
382382

383-
((MockKernel *)kernel)->setTotalSLMSize(1024);
383+
((MockKernel *)kernel)->setTotalSLMSize(rootDeviceIndex, 1024);
384384

385385
cmdList.clear();
386386
commandQueue.enqueueKernel(kernel, 1, nullptr, &GWS, nullptr, 0, nullptr, nullptr);

opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests.inl

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,7 @@ void CommandStreamReceiverHwTest<GfxFamily>::givenKernelWithSlmWhenPreviousNOSLM
4343
commandStreamReceiver->isPreambleSent = true;
4444
commandStreamReceiver->lastSentL3Config = 0;
4545

46-
static_cast<MockKernel *>(kernel)->setTotalSLMSize(1024);
46+
static_cast<MockKernel *>(kernel)->setTotalSLMSize(rootDeviceIndex, 1024);
4747

4848
cmdList.clear();
4949
commandQueue.enqueueKernel(kernel, 1, nullptr, &GWS, nullptr, 0, nullptr, nullptr);
@@ -89,7 +89,7 @@ void CommandStreamReceiverHwTest<GfxFamily>::givenBlockedKernelWithSlmWhenPrevio
8989
commandStreamReceiver->isPreambleSent = true;
9090
commandStreamReceiver->lastSentL3Config = 0;
9191

92-
static_cast<MockKernel *>(kernel)->setTotalSLMSize(1024);
92+
static_cast<MockKernel *>(kernel)->setTotalSLMSize(rootDeviceIndex, 1024);
9393

9494
commandQueue.enqueueKernel(kernel, 1, nullptr, &GWS, nullptr, 1, &blockingEvent, nullptr);
9595

opencl/test/unit_test/helpers/dispatch_info_builder_tests.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ class DispatchInfoBuilderFixture : public ContextFixture, public ClDeviceFixture
6868
pKernel->setCrossThreadData(pCrossThreadData, sizeof(pCrossThreadData));
6969
pKernel->setKernelArgHandler(0, &Kernel::setArgBuffer);
7070

71-
pKernel->slmTotalSize = 128;
71+
pKernel->kernelDeviceInfos[rootDeviceIndex].slmTotalSize = 128;
7272
pKernel->isBuiltIn = true;
7373
}
7474

0 commit comments

Comments
 (0)