@@ -65,8 +65,7 @@ class Surface;
6565uint32_t Kernel::dummyPatchLocation = 0xbaddf00d ;
6666
6767Kernel::Kernel (Program *programArg, const KernelInfoContainer &kernelInfosArg, bool schedulerKernel)
68- : slmTotalSize(kernelInfosArg[programArg->getDevices ()[0]->getRootDeviceIndex()]->workloadInfo.slmStaticSize),
69- isParentKernel(kernelInfosArg[programArg->getDevices ()[0]->getRootDeviceIndex()]->kernelDescriptor.kernelAttributes.flags.usesDeviceSideEnqueue),
68+ : isParentKernel(kernelInfosArg[programArg->getDevices ()[0]->getRootDeviceIndex()]->kernelDescriptor.kernelAttributes.flags.usesDeviceSideEnqueue),
7069 isSchedulerKernel(schedulerKernel),
7170 executionEnvironment(programArg->getExecutionEnvironment ()),
7271 program(programArg),
@@ -78,7 +77,9 @@ Kernel::Kernel(Program *programArg, const KernelInfoContainer &kernelInfosArg, b
7877 program->retainForKernel ();
7978 imageTransformer.reset (new ImageTransformer);
8079 for (const auto &pClDevice : deviceVector) {
81- kernelDeviceInfos[pClDevice->getRootDeviceIndex ()].maxKernelWorkGroupSize = static_cast <uint32_t >(pClDevice->getSharedDeviceInfo ().maxWorkGroupSize );
80+ auto rootDeviceIndex = pClDevice->getRootDeviceIndex ();
81+ kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize = static_cast <uint32_t >(pClDevice->getSharedDeviceInfo ().maxWorkGroupSize );
82+ kernelDeviceInfos[rootDeviceIndex].slmTotalSize = kernelInfosArg[rootDeviceIndex]->workloadInfo .slmStaticSize ;
8283 }
8384}
8485
@@ -100,7 +101,7 @@ Kernel::~Kernel() {
100101 }
101102
102103 for (uint32_t i = 0 ; i < patchedArgumentsNum; i++) {
103- if (getDefaultKernelInfo (). kernelArgInfo . at (i). isSampler ) {
104+ if (SAMPLER_OBJ == getKernelArguments ()[i]. type ) {
104105 auto sampler = castToObject<Sampler>(kernelArguments.at (i).object );
105106 if (sampler) {
106107 sampler->decRefInternal ();
@@ -372,6 +373,8 @@ cl_int Kernel::initialize() {
372373 if (program->isKernelDebugEnabled () && kernelInfo.patchInfo .pAllocateSystemThreadSurface ) {
373374 debugEnabled = true ;
374375 }
376+ auto numArgs = kernelInfo.kernelArgInfo .size ();
377+ kernelDeviceInfo.slmSizes .resize (numArgs);
375378 isDeviceInitialized.set (rootDeviceIndex);
376379 }
377380
@@ -384,13 +387,11 @@ cl_int Kernel::initialize() {
384387 auto &defaultKernelInfo = getDefaultKernelInfo ();
385388 auto numArgs = defaultKernelInfo.kernelArgInfo .size ();
386389 kernelArguments.resize (numArgs);
387- slmSizes.resize (numArgs);
388390 kernelArgHandlers.resize (numArgs);
389391 kernelArgRequiresCacheFlush.resize (numArgs);
390392
391393 for (uint32_t i = 0 ; i < numArgs; ++i) {
392394 storeKernelArg (i, NONE_OBJ, nullptr , nullptr , 0 );
393- slmSizes[i] = 0 ;
394395
395396 // set the argument handler
396397 auto &argInfo = defaultKernelInfo.kernelArgInfo [i];
@@ -483,7 +484,6 @@ cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize,
483484 const _cl_context *ctxt;
484485 cl_uint refCount = 0 ;
485486 uint64_t nonCannonizedGpuAddress = 0llu;
486- auto defaultRootDeviceIndex = getDevices ()[0 ]->getRootDeviceIndex ();
487487 auto &defaultKernelInfo = getKernelInfo (defaultRootDeviceIndex);
488488
489489 switch (paramName) {
@@ -1112,7 +1112,7 @@ uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *local
11121112 availableThreadCount,
11131113 dssCount,
11141114 dssCount * KB * hardwareInfo.capabilityTable .slmSize ,
1115- hwHelper.alignSlmSize (slmTotalSize),
1115+ hwHelper.alignSlmSize (kernelDeviceInfos[rootDeviceIndex]. slmTotalSize ),
11161116 static_cast <uint32_t >(hwHelper.getMaxBarrierRegisterPerSlice ()),
11171117 hwHelper.getBarriersCountFromHasBarriers (barrierCount),
11181118 workDim,
@@ -1280,7 +1280,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex,
12801280
12811281 storeKernelArg (argIndex, SLM_OBJ, nullptr , argVal, argSize);
12821282
1283- slmSizes[argIndex] = argSize;
1283+ kernelDeviceInfos[rootDeviceIndex]. slmSizes [argIndex] = argSize;
12841284
12851285 // Extract our current slmOffset
12861286 auto slmOffset = *ptrOffset (crossThreadData,
@@ -1291,7 +1291,7 @@ cl_int Kernel::setArgLocal(uint32_t argIndex,
12911291
12921292 // Update all slm offsets after this argIndex
12931293 ++argIndex;
1294- while (argIndex < slmSizes.size ()) {
1294+ while (argIndex < kernelDeviceInfos[rootDeviceIndex]. slmSizes .size ()) {
12951295 const auto &kernelArgInfo = defaultKernelInfo.kernelArgInfo [argIndex];
12961296 auto slmAlignment = kernelArgInfo.slmAlignment ;
12971297
@@ -1306,11 +1306,11 @@ cl_int Kernel::setArgLocal(uint32_t argIndex,
13061306 *patchLocation = slmOffset;
13071307 }
13081308
1309- slmOffset += static_cast <uint32_t >(slmSizes[argIndex]);
1309+ slmOffset += static_cast <uint32_t >(kernelDeviceInfos[rootDeviceIndex]. slmSizes [argIndex]);
13101310 ++argIndex;
13111311 }
13121312
1313- slmTotalSize = defaultKernelInfo.workloadInfo .slmStaticSize + alignUp (slmOffset, KB);
1313+ kernelDeviceInfos[rootDeviceIndex]. slmTotalSize = defaultKernelInfo.workloadInfo .slmStaticSize + alignUp (slmOffset, KB);
13141314
13151315 return CL_SUCCESS;
13161316}
@@ -2679,4 +2679,7 @@ void Kernel::setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim) {
26792679uint32_t Kernel::getMaxKernelWorkGroupSize (uint32_t rootDeviceIndex) const {
26802680 return kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize ;
26812681}
2682+ uint32_t Kernel::getSlmTotalSize (uint32_t rootDeviceIndex) const {
2683+ return kernelDeviceInfos[rootDeviceIndex].slmTotalSize ;
2684+ }
26822685} // namespace NEO
0 commit comments