Skip to content

Commit 69bef97

Browse files
OCL cache per thread data
Add caching mechanism for local ids in OCL. Signed-off-by: Krystian Chmielewski <[email protected]>
1 parent 8a9ea9a commit 69bef97

21 files changed

+308
-501
lines changed

opencl/source/command_queue/hardware_interface_bdw_and_later.inl

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -90,7 +90,7 @@ inline void HardwareInterface<GfxFamily>::programWalker(
9090
walkerArgs.preemptionMode,
9191
&walkerCmd,
9292
nullptr,
93-
true,
93+
kernelUsesLocalIds,
9494
commandQueue.getDevice());
9595

9696
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernel.getKernelInfo().kernelDescriptor,

opencl/source/command_queue/hardware_interface_xehp_and_later.inl

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -63,15 +63,16 @@ inline void HardwareInterface<GfxFamily>::programWalker(
6363
auto threadGroupCount = static_cast<uint32_t>(walkerArgs.numberOfWorkgroups->x * walkerArgs.numberOfWorkgroups->y * walkerArgs.numberOfWorkgroups->z);
6464
uint32_t requiredWalkOrder = 0u;
6565

66-
bool localIdsGenerationByRuntime = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
67-
numChannels,
68-
walkerArgs.localWorkSizes,
69-
std::array<uint8_t, 3>{{kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[0],
70-
kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[1],
71-
kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}},
72-
kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder,
73-
requiredWalkOrder,
74-
simd);
66+
auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
67+
bool localIdsGenerationByRuntime = kernelUsesLocalIds && EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
68+
numChannels,
69+
walkerArgs.localWorkSizes,
70+
std::array<uint8_t, 3>{{kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[0],
71+
kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[1],
72+
kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}},
73+
kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder,
74+
requiredWalkOrder,
75+
simd);
7576

7677
bool inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel);
7778
auto idd = &walkerCmd.getInterfaceDescriptor();
@@ -83,7 +84,6 @@ inline void HardwareInterface<GfxFamily>::programWalker(
8384
}
8485

8586
auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
86-
auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
8787

8888
const auto &hwInfo = commandQueue.getDevice().getHardwareInfo();
8989
if (auto kernelAllocation = kernelInfo.getGraphicsAllocation()) {

opencl/source/helpers/hardware_commands_helper.h

Lines changed: 4 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -90,23 +90,12 @@ struct HardwareCommandsHelper : public PerThreadDataHelper {
9090
const Device &device);
9191

9292
static void programPerThreadData(
93+
bool localIdsGenerationByRuntime,
9394
size_t &sizePerThreadData,
94-
const bool &localIdsGenerationByRuntime,
95-
LinearStream &ioh,
96-
uint32_t &simd,
97-
uint32_t &numChannels,
98-
const size_t localWorkSize[3],
99-
Kernel &kernel,
100-
size_t &sizePerThreadDataTotal,
101-
size_t &localWorkItems,
102-
uint32_t rootDeviceIndex);
103-
104-
static void updatePerThreadDataTotal(
105-
size_t &sizePerThreadData,
106-
uint32_t &simd,
107-
uint32_t &numChannels,
10895
size_t &sizePerThreadDataTotal,
109-
size_t &localWorkItems);
96+
LinearStream &ioh,
97+
const Kernel &kernel,
98+
const size_t localWorkSize[3]);
11099

111100
static size_t getSizeRequiredCS();
112101
static size_t getSizeRequiredForCacheFlush(const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress);

opencl/source/helpers/hardware_commands_helper_base.inl

Lines changed: 22 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -179,6 +179,25 @@ size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
179179
return (size_t)offsetInterfaceDescriptor;
180180
}
181181

182+
// Programs the per-thread (local id) payload into the indirect object heap.
//
// Only acts when localIdsGenerationByRuntime is true; otherwise the function
// is a no-op and both size outputs are left exactly as the caller passed them.
//
// Parameters:
//   localIdsGenerationByRuntime - gate for all work done here.
//   sizePerThreadData           - out: set to kernel.getLocalIdsSizePerThread().
//   sizePerThreadDataTotal      - out: set to kernel.getLocalIdsSizeForGroup(group);
//                                 this is also the number of bytes reserved in ioh.
//   ioh                         - stream from which the destination space is taken.
//   kernel                      - supplies the sizes and fills the reserved space.
//   localWorkSize               - three work-group dimensions.
//
// NOTE(review): the Kernel accessors used below are guarded by
// UNRECOVERABLE_IF(localIdsCache.get() == nullptr) in kernel.cpp, and the cache
// is only created when numLocalIdChannels > 0 - callers are presumably expected
// to pass true only for kernels that use local ids; confirm at call sites.
template <typename GfxFamily>
183+
void HardwareCommandsHelper<GfxFamily>::programPerThreadData(
184+
bool localIdsGenerationByRuntime,
185+
size_t &sizePerThreadData,
186+
size_t &sizePerThreadDataTotal,
187+
LinearStream &ioh,
188+
const Kernel &kernel,
189+
const size_t localWorkSize[3]) {
190+
if (localIdsGenerationByRuntime) {
191+
// Each dimension is narrowed from size_t to uint16_t; values above 65535
// would be truncated by these casts.
Vec3<uint16_t> group = {static_cast<uint16_t>(localWorkSize[0]),
192+
static_cast<uint16_t>(localWorkSize[1]),
193+
static_cast<uint16_t>(localWorkSize[2])};
194+
sizePerThreadData = kernel.getLocalIdsSizePerThread();
195+
sizePerThreadDataTotal = kernel.getLocalIdsSizeForGroup(group);
196+
// Reserve exactly the group-wide size in the heap, then let the kernel
// write the local ids for this group into that space.
auto dest = ioh.getSpace(sizePerThreadDataTotal);
197+
kernel.setLocalIdsForGroup(group, dest);
198+
}
199+
}
200+
182201
template <typename GfxFamily>
183202
size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
184203
LinearStream &commandStream,
@@ -200,8 +219,6 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
200219

201220
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
202221

203-
auto rootDeviceIndex = device.getRootDeviceIndex();
204-
205222
DEBUG_BREAK_IF(simd != 1 && simd != 8 && simd != 16 && simd != 32);
206223
auto inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel);
207224

@@ -228,7 +245,6 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
228245

229246
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
230247
auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
231-
auto numChannels = static_cast<uint32_t>(kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels);
232248

233249
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();
234250

@@ -240,16 +256,12 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
240256
size_t sizePerThreadData = 0;
241257

242258
HardwareCommandsHelper<GfxFamily>::programPerThreadData(
243-
sizePerThreadData,
244259
localIdsGenerationByRuntime,
260+
sizePerThreadData,
261+
sizePerThreadDataTotal,
245262
ioh,
246-
simd,
247-
numChannels,
248-
localWorkSize,
249263
kernel,
250-
sizePerThreadDataTotal,
251-
localWorkItems,
252-
rootDeviceIndex);
264+
localWorkSize);
253265

254266
uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);
255267

@@ -296,23 +308,6 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
296308
return offsetCrossThreadData;
297309
}
298310

299-
template <typename GfxFamily>
300-
void HardwareCommandsHelper<GfxFamily>::updatePerThreadDataTotal(
301-
size_t &sizePerThreadData,
302-
uint32_t &simd,
303-
uint32_t &numChannels,
304-
size_t &sizePerThreadDataTotal,
305-
size_t &localWorkItems) {
306-
uint32_t grfSize = sizeof(typename GfxFamily::GRF);
307-
sizePerThreadData = getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
308-
309-
uint32_t localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, grfSize, numChannels);
310-
localIdSizePerThread = std::max(localIdSizePerThread, grfSize);
311-
312-
sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkItems) * localIdSizePerThread;
313-
DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
314-
}
315-
316311
template <typename GfxFamily>
317312
bool HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(const Kernel &kernel) {
318313
auto checkKernelForInlineData = true;

opencl/source/helpers/hardware_commands_helper_bdw_and_later.inl

Lines changed: 0 additions & 29 deletions
Original file line number | Diff line number | Diff line change
@@ -74,35 +74,6 @@ void HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
7474
}
7575
}
7676

77-
template <typename GfxFamily>
78-
void HardwareCommandsHelper<GfxFamily>::programPerThreadData(
79-
size_t &sizePerThreadData,
80-
const bool &localIdsGenerationByRuntime,
81-
LinearStream &ioh,
82-
uint32_t &simd,
83-
uint32_t &numChannels,
84-
const size_t localWorkSize[3],
85-
Kernel &kernel,
86-
size_t &sizePerThreadDataTotal,
87-
size_t &localWorkItems,
88-
uint32_t rootDeviceIndex) {
89-
90-
uint32_t grfSize = sizeof(typename GfxFamily::GRF);
91-
92-
sendPerThreadData(
93-
ioh,
94-
simd,
95-
grfSize,
96-
numChannels,
97-
std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSize[0]), static_cast<uint16_t>(localWorkSize[1]), static_cast<uint16_t>(localWorkSize[2])}},
98-
std::array<uint8_t, 3>{{kernel.getKernelInfo().kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0],
99-
kernel.getKernelInfo().kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1],
100-
kernel.getKernelInfo().kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2]}},
101-
kernel.usesOnlyImages());
102-
103-
updatePerThreadDataTotal(sizePerThreadData, simd, numChannels, sizePerThreadDataTotal, localWorkItems);
104-
}
105-
10677
template <typename GfxFamily>
10778
size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
10879
IndirectHeap &indirectHeap,

opencl/source/helpers/hardware_commands_helper_xehp_and_later.inl

Lines changed: 0 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -58,33 +58,6 @@ void HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
5858
size_t sizeInterfaceDescriptorData) {
5959
}
6060

61-
template <typename GfxFamily>
62-
void HardwareCommandsHelper<GfxFamily>::programPerThreadData(
63-
size_t &sizePerThreadData,
64-
const bool &localIdsGenerationByRuntime,
65-
LinearStream &ioh,
66-
uint32_t &simd,
67-
uint32_t &numChannels,
68-
const size_t localWorkSize[3],
69-
Kernel &kernel,
70-
size_t &sizePerThreadDataTotal,
71-
size_t &localWorkItems,
72-
uint32_t rootDeviceIndex) {
73-
if (localIdsGenerationByRuntime) {
74-
constexpr uint32_t grfSize = sizeof(typename GfxFamily::GRF);
75-
sendPerThreadData(
76-
ioh,
77-
simd,
78-
grfSize,
79-
numChannels,
80-
std::array<uint16_t, 3>{{static_cast<uint16_t>(localWorkSize[0]), static_cast<uint16_t>(localWorkSize[1]), static_cast<uint16_t>(localWorkSize[2])}},
81-
{{0u, 1u, 2u}},
82-
kernel.usesOnlyImages());
83-
84-
updatePerThreadDataTotal(sizePerThreadData, simd, numChannels, sizePerThreadDataTotal, localWorkItems);
85-
}
86-
}
87-
8861
template <typename GfxFamily>
8962
size_t HardwareCommandsHelper<GfxFamily>::sendCrossThreadData(
9063
IndirectHeap &indirectHeap,

opencl/source/kernel/kernel.cpp

Lines changed: 30 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
#include "shared/source/helpers/ptr_math.h"
2323
#include "shared/source/helpers/surface_format_info.h"
2424
#include "shared/source/kernel/kernel_arg_descriptor_extended_vme.h"
25+
#include "shared/source/kernel/local_ids_cache.h"
2526
#include "shared/source/memory_manager/memory_manager.h"
2627
#include "shared/source/memory_manager/unified_memory_manager.h"
2728
#include "shared/source/os_interface/hw_info_config.h"
@@ -314,6 +315,10 @@ cl_int Kernel::initialize() {
314315
usingImagesOnly = true;
315316
}
316317

318+
if (kernelDescriptor.kernelAttributes.numLocalIdChannels > 0) {
319+
initializeLocalIdsCache();
320+
}
321+
317322
return CL_SUCCESS;
318323
}
319324

@@ -2293,4 +2298,29 @@ bool Kernel::graphicsAllocationTypeUseSystemMemory(AllocationType type) {
22932298
(type == AllocationType::SVM_ZERO_COPY);
22942299
}
22952300

2301+
// Creates the per-kernel LocalIdsCache and stores it in localIdsCache.
//
// The cache is constructed from the kernel descriptor's workgroup dimensions
// order and SIMD size, the device's GRF size, and the usingImagesOnly flag.
// Kernel::initialize() calls this only when numLocalIdChannels > 0, so kernels
// without local ids keep localIdsCache empty.
void Kernel::initializeLocalIdsCache() {
2302+
auto workgroupDimensionsOrder = getDescriptor().kernelAttributes.workgroupDimensionsOrder;
2303+
std::array<uint8_t, 3> wgDimOrder = {workgroupDimensionsOrder[0],
2304+
workgroupDimensionsOrder[1],
2305+
workgroupDimensionsOrder[2]};
2306+
auto simdSize = getDescriptor().kernelAttributes.simdSize;
2307+
// GRF size is narrowed to uint8_t before being handed to the cache.
auto grfSize = static_cast<uint8_t>(getDevice().getHardwareInfo().capabilityTable.grfSize);
2308+
// NOTE(review): the leading 4 is presumably the cache capacity (number of
// cached group sizes) - confirm against the LocalIdsCache constructor.
localIdsCache = std::make_unique<LocalIdsCache>(4, wgDimOrder, simdSize, grfSize, usingImagesOnly);
2309+
}
2310+
2311+
// Forwards to LocalIdsCache::setLocalIdsForGroup with the given group size and
// destination pointer.
//
// Precondition: localIdsCache must be set; UNRECOVERABLE_IF guards against a
// null cache (it is only created for kernels with numLocalIdChannels > 0).
// NOTE(review): presumably writes the local-id payload for groupSize into
// destination - confirm required buffer size against LocalIdsCache.
void Kernel::setLocalIdsForGroup(const Vec3<uint16_t> &groupSize, void *destination) const {
2312+
UNRECOVERABLE_IF(localIdsCache.get() == nullptr);
2313+
localIdsCache->setLocalIdsForGroup(groupSize, destination);
2314+
}
2315+
2316+
// Returns LocalIdsCache::getLocalIdsSizeForGroup(groupSize) unchanged.
//
// Precondition: localIdsCache must be set; UNRECOVERABLE_IF guards against a
// null cache (it is only created for kernels with numLocalIdChannels > 0).
size_t Kernel::getLocalIdsSizeForGroup(const Vec3<uint16_t> &groupSize) const {
2317+
UNRECOVERABLE_IF(localIdsCache.get() == nullptr);
2318+
return localIdsCache->getLocalIdsSizeForGroup(groupSize);
2319+
}
2320+
2321+
// Returns LocalIdsCache::getLocalIdsSizePerThread() unchanged.
//
// Precondition: localIdsCache must be set; UNRECOVERABLE_IF guards against a
// null cache (it is only created for kernels with numLocalIdChannels > 0).
size_t Kernel::getLocalIdsSizePerThread() const {
2322+
UNRECOVERABLE_IF(localIdsCache.get() == nullptr);
2323+
return localIdsCache->getLocalIdsSizePerThread();
2324+
}
2325+
22962326
} // namespace NEO

opencl/source/kernel/kernel.h

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ class ImageTransformer;
4040
class Surface;
4141
class PrintfHandler;
4242
class MultiDeviceKernel;
43+
class LocalIdsCache;
4344

4445
class Kernel : public ReferenceTrackedObject<Kernel> {
4546
public:
@@ -417,6 +418,10 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
417418
return isDestinationAllocationInSystemMemory;
418419
}
419420

421+
void setLocalIdsForGroup(const Vec3<uint16_t> &groupSize, void *destination) const;
422+
size_t getLocalIdsSizeForGroup(const Vec3<uint16_t> &groupSize) const;
423+
size_t getLocalIdsSizePerThread() const;
424+
420425
protected:
421426
struct KernelConfig {
422427
Vec3<size_t> gws;
@@ -482,6 +487,9 @@ class Kernel : public ReferenceTrackedObject<Kernel> {
482487
bool hasTunningFinished(KernelSubmissionData &submissionData);
483488
bool hasRunFinished(TimestampPacketContainer *timestampContainer);
484489

490+
void initializeLocalIdsCache();
491+
std::unique_ptr<LocalIdsCache> localIdsCache;
492+
485493
UnifiedMemoryControls unifiedMemoryControls{};
486494

487495
std::map<uint32_t, MemObj *> migratableArgsMap{};

opencl/test/unit_test/command_queue/dispatch_walker_tests_xehp_and_later.cpp

Lines changed: 5 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -684,10 +684,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
684684

685685
auto &kd = kernel->kernelInfo.kernelDescriptor;
686686
kd.kernelAttributes.flags.passInlineData = true;
687-
kd.kernelAttributes.localId[0] = 1;
688-
kd.kernelAttributes.localId[1] = 0;
689-
kd.kernelAttributes.localId[2] = 0;
690-
kd.kernelAttributes.numLocalIdChannels = 1;
691687

692688
kernel->mockKernel->setCrossThreadData(crossThreadDataGrf, sizeof(INLINE_DATA));
693689

@@ -722,7 +718,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
722718

723719
uint32_t simd = kernel->mockKernel->getKernelInfo().getMaxSimdSize();
724720
// only X is present
725-
auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, sizeGrf, 1);
721+
auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, sizeGrf);
726722
sizePerThreadData = std::max(sizePerThreadData, sizeGrf);
727723
size_t perThreadTotalDataSize = getThreadsPerWG(simd, lws[0]) * sizePerThreadData;
728724

@@ -825,7 +821,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
825821

826822
uint32_t simd = kernel->mockKernel->getKernelInfo().getMaxSimdSize();
827823
// only X is present
828-
uint32_t localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, sizeGrf, 1);
824+
uint32_t localIdSizePerThread = getPerThreadSizeLocalIDs(simd, sizeGrf);
829825
localIdSizePerThread = std::max(localIdSizePerThread, sizeGrf);
830826
auto sizePerThreadData = getThreadsPerWG(simd, lws[0]) * localIdSizePerThread;
831827

@@ -840,7 +836,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
840836
memoryManager->freeGraphicsMemory(kernel->kernelInfo.kernelAllocation);
841837
}
842838

843-
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlineDataEnabledWhenNoLocalIdsUsedThenExpectCrossThreadDataInWalkerAndNoEmitLocalFieldSet) {
839+
HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenKernelWithoutLocalIdsAndPassInlineDataEnabledWhenNoHWGenerationOfLocalIdsUsedThenExpectCrossThreadDataInWalkerAndNoEmitLocalFieldSet) {
844840
using COMPUTE_WALKER = typename FamilyType::COMPUTE_WALKER;
845841
using INLINE_DATA = typename FamilyType::INLINE_DATA;
846842

@@ -851,9 +847,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
851847

852848
auto &kd = kernel->kernelInfo.kernelDescriptor;
853849
kd.kernelAttributes.flags.passInlineData = true;
854-
kd.kernelAttributes.localId[0] = 0;
855-
kd.kernelAttributes.localId[1] = 0;
856-
kd.kernelAttributes.localId[2] = 0;
857850
kd.kernelAttributes.numLocalIdChannels = 0;
858851

859852
kernel->mockKernel->setCrossThreadData(crossThreadDataGrf, sizeof(INLINE_DATA));
@@ -878,11 +871,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
878871

879872
EXPECT_EQ(0, memcmp(walker->getInlineDataPointer(), crossThreadDataGrf, sizeof(INLINE_DATA)));
880873

881-
uint32_t simd = kernel->mockKernel->getKernelInfo().getMaxSimdSize();
882-
// only X is present
883-
auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, 1);
884-
sizePerThreadData = std::max(sizePerThreadData, sizeGrf);
885-
size_t perThreadTotalDataSize = getThreadsPerWG(simd, lws[0]) * sizePerThreadData;
874+
size_t perThreadTotalDataSize = 0U;
886875
uint32_t expectedIndirectDataLength = static_cast<uint32_t>(perThreadTotalDataSize);
887876
expectedIndirectDataLength = alignUp(expectedIndirectDataLength, COMPUTE_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
888877
EXPECT_EQ(expectedIndirectDataLength, walker->getIndirectDataLength());
@@ -902,9 +891,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
902891

903892
auto &kd = kernel->kernelInfo.kernelDescriptor;
904893
kd.kernelAttributes.flags.passInlineData = true;
905-
kd.kernelAttributes.localId[0] = 0;
906-
kd.kernelAttributes.localId[1] = 0;
907-
kd.kernelAttributes.localId[2] = 0;
908894
kd.kernelAttributes.numLocalIdChannels = 0;
909895

910896
kernel->mockKernel->setCrossThreadData(crossThreadDataTwoGrf, sizeof(INLINE_DATA) * 2);
@@ -931,11 +917,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterDispatchWalkerBasicTest, givenPassInlin
931917
void *payloadData = ih.getCpuBase();
932918
EXPECT_EQ(0, memcmp(payloadData, &crossThreadDataTwoGrf[sizeof(INLINE_DATA) / sizeof(uint32_t)], sizeof(INLINE_DATA)));
933919

934-
uint32_t simd = kernel->mockKernel->getKernelInfo().getMaxSimdSize();
935-
// only X is present
936-
auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, 1);
937-
sizePerThreadData = std::max(sizePerThreadData, sizeGrf);
938-
size_t perThreadTotalDataSize = getThreadsPerWG(simd, lws[0]) * sizePerThreadData;
920+
size_t perThreadTotalDataSize = 0;
939921

940922
// second GRF in indirect
941923
uint32_t expectedIndirectDataLength = static_cast<uint32_t>(perThreadTotalDataSize + sizeof(INLINE_DATA));

0 commit comments

Comments
 (0)