Skip to content

Commit 225e7f0

Browse files
Add checks for correct engine for concurrent kernels.
Related-To: NEO-5135
Change-Id: Ib1c37ec8d5e468de331521ae4be1cd92902a2330
Signed-off-by: Sebastian Luzynski <[email protected]>
1 parent f9a97cb commit 225e7f0

File tree

9 files changed

+93
-13
lines changed

9 files changed

+93
-13
lines changed

opencl/source/api/api.cpp

Lines changed: 11 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5541,7 +5541,9 @@ cl_int CL_API_CALL clGetKernelMaxConcurrentWorkGroupCountINTEL(cl_command_queue
55415541
return retVal;
55425542
}
55435543

5544-
*suggestedWorkGroupCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize);
5544+
CommandQueue *pCommandQueue = nullptr;
5545+
WithCastToInternal(commandQueue, &pCommandQueue);
5546+
*suggestedWorkGroupCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);
55455547

55465548
return retVal;
55475549
}
@@ -5579,6 +5581,13 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue,
55795581
return retVal;
55805582
}
55815583

5584+
auto &hardwareInfo = pKernel->getDevice().getHardwareInfo();
5585+
auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
5586+
if (!hwHelper.isCooperativeDispatchSupported(pCommandQueue->getGpgpuEngine().getEngineType(), hardwareInfo.platform.eProductFamily)) {
5587+
retVal = CL_INVALID_COMMAND_QUEUE;
5588+
return retVal;
5589+
}
5590+
55825591
size_t globalWorkSize[3];
55835592
for (size_t i = 0; i < workDim; i++) {
55845593
globalWorkSize[i] = workgroupCount[i] * localWorkSize[i];
@@ -5589,7 +5598,7 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue,
55895598
for (size_t i = 0; i < workDim; i++) {
55905599
requestedNumberOfWorkgroups *= workgroupCount[i];
55915600
}
5592-
size_t maximalNumberOfWorkgroupsAllowed = pKernel->getMaxWorkGroupCount(workDim, localWorkSize);
5601+
size_t maximalNumberOfWorkgroupsAllowed = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);
55935602
if (requestedNumberOfWorkgroups > maximalNumberOfWorkgroupsAllowed) {
55945603
retVal = CL_INVALID_VALUE;
55955604
return retVal;

opencl/source/kernel/kernel.cpp

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1053,14 +1053,19 @@ void Kernel::getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *glob
10531053
localWorkSize[2] = suggestedLws.z;
10541054
}
10551055

1056-
uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize) const {
1056+
uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const {
10571057
auto &hardwareInfo = getDevice().getHardwareInfo();
1058+
auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
1059+
1060+
if (!hwHelper.isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), hardwareInfo.platform.eProductFamily)) {
1061+
return 0;
1062+
}
1063+
10581064
auto executionEnvironment = kernelInfo.patchInfo.executionEnvironment;
10591065
auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
10601066
if (dssCount == 0) {
10611067
dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
10621068
}
1063-
auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
10641069
auto availableThreadCount = hwHelper.calculateAvailableThreadCount(
10651070
hardwareInfo.platform.eProductFamily,
10661071
((executionEnvironment != nullptr) ? executionEnvironment->NumGRFRequired : GrfConfig::DefaultGrfNumber),

opencl/source/kernel/kernel.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -408,7 +408,7 @@ class Kernel : public BaseObject<_cl_kernel> {
408408
}
409409
void getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset,
410410
size_t *localWorkSize);
411-
uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize) const;
411+
uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const;
412412

413413
uint64_t getKernelStartOffset(
414414
const bool localIdsGenerationByRuntime,

opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -64,15 +64,15 @@ TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenVariousInputWhenGetting
6464
retVal = clGetKernelMaxConcurrentWorkGroupCountINTEL(pCommandQueue, pKernel, workDim, globalWorkOffset, localWorkSize,
6565
&maxConcurrentWorkGroupCount);
6666
EXPECT_EQ(CL_SUCCESS, retVal);
67-
size_t expectedMaxConcurrentWorkGroupCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize);
67+
size_t expectedMaxConcurrentWorkGroupCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);
6868
EXPECT_EQ(expectedMaxConcurrentWorkGroupCount, maxConcurrentWorkGroupCount);
6969

7070
std::unique_ptr<MockKernel> pKernelWithExecutionEnvironmentPatch(MockKernel::create(pCommandQueue->getDevice(), pProgram));
7171
retVal = clGetKernelMaxConcurrentWorkGroupCountINTEL(pCommandQueue, pKernelWithExecutionEnvironmentPatch.get(), workDim,
7272
globalWorkOffset, localWorkSize,
7373
&maxConcurrentWorkGroupCount);
7474
EXPECT_EQ(CL_SUCCESS, retVal);
75-
expectedMaxConcurrentWorkGroupCount = pKernelWithExecutionEnvironmentPatch->getMaxWorkGroupCount(workDim, localWorkSize);
75+
expectedMaxConcurrentWorkGroupCount = pKernelWithExecutionEnvironmentPatch->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);
7676
EXPECT_EQ(expectedMaxConcurrentWorkGroupCount, maxConcurrentWorkGroupCount);
7777
}
7878

opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -214,6 +214,11 @@ TEST_F(EnqueueKernelTest, givenKernelWhenAllArgsAreSetThenClEnqueueNDCountKernel
214214
cl_int retVal = CL_SUCCESS;
215215
CommandQueue *pCmdQ2 = createCommandQueue(pClDevice);
216216

217+
HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
218+
if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
219+
pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, true, false).osContext;
220+
}
221+
217222
std::unique_ptr<Kernel> kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal));
218223
EXPECT_EQ(CL_SUCCESS, retVal);
219224

@@ -253,6 +258,11 @@ TEST_F(EnqueueKernelTest, givenKernelWhenNotAllArgsAreSetButSetKernelArgIsCalled
253258
cl_int retVal = CL_SUCCESS;
254259
CommandQueue *pCmdQ2 = createCommandQueue(pClDevice);
255260

261+
HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
262+
if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
263+
pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, true, false).osContext;
264+
}
265+
256266
std::unique_ptr<Kernel> kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal));
257267
EXPECT_EQ(CL_SUCCESS, retVal);
258268

@@ -292,6 +302,11 @@ TEST_F(EnqueueKernelTest, givenKernelWhenSetKernelArgIsCalledForEachArgButAtLeas
292302
cl_int retVal = CL_SUCCESS;
293303
CommandQueue *pCmdQ2 = createCommandQueue(pClDevice);
294304

305+
HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
306+
if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
307+
pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, true, false).osContext;
308+
}
309+
295310
std::unique_ptr<Kernel> kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal));
296311
EXPECT_EQ(CL_SUCCESS, retVal);
297312

opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp

Lines changed: 41 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -25,25 +25,56 @@ class MockSyncBufferHandler : public SyncBufferHandler {
2525
using SyncBufferHandler::usedBufferSize;
2626
};
2727

28-
class SyncBufferHandlerTest : public EnqueueHandlerTest {
28+
// Test fixture layered on EnqueueHandlerTest that builds its own device, CL device and
// context from a HardwareInfo copy configured for the current productFamily.
// NOTE(review): SetUp/TearDown carry no `override` here; whether they replace virtuals of
// the base fixture cannot be confirmed from this view.
class SyncBufferEnqueueHandlerTest : public EnqueueHandlerTest {
29+
public:
30+
// Copies *defaultHwInfo into hardwareInfo, runs the per-product setup routine on that copy
// with the product's default config value, then hands the result to SetUpImpl().
void SetUp() {
31+
hardwareInfo = *defaultHwInfo;
32+
// Default config value for this product, looked up by productFamily.
uint64_t hwInfoConfig = defaultHardwareInfoConfigTable[productFamily];
33+
// NOTE(review): the meaning of the `true` flag is not visible here — confirm against
// the hardwareInfoSetup function signature.
hardwareInfoSetup[productFamily](&hardwareInfo, true, hwInfoConfig);
34+
SetUpImpl(&hardwareInfo);
35+
}
36+
37+
// Drops one internal reference on the context, deletes the CL device and clears both
// device pointers. pDevice is only nulled, not deleted — presumably it is owned by the
// MockClDevice created in SetUpImpl(); confirm.
void TearDown() {
38+
context->decRefInternal();
39+
delete pClDevice;
40+
pClDevice = nullptr;
41+
pDevice = nullptr;
42+
}
43+
44+
// Creates a MockDevice (with a new execution environment) for the given hardware info,
// wraps it in a MockClDevice, caches the GPGPU command stream receiver's tag address in
// pTagMemory and creates a MockContext for the CL device.
// The parameter name hides the `hardwareInfo` that SetUp() assigns.
void SetUpImpl(const NEO::HardwareInfo *hardwareInfo) {
45+
pDevice = MockDevice::createWithNewExecutionEnvironment<MockDevice>(hardwareInfo);
46+
ASSERT_NE(nullptr, pDevice);
47+
pClDevice = new MockClDevice{pDevice};
48+
ASSERT_NE(nullptr, pClDevice);
49+
50+
auto &commandStreamReceiver = pDevice->getGpgpuCommandStreamReceiver();
51+
pTagMemory = commandStreamReceiver.getTagAddress();
52+
// Cast to a plain uint32_t pointer so the tag address can be compared against nullptr.
ASSERT_NE(nullptr, const_cast<uint32_t *>(pTagMemory));
53+
54+
context = new NEO::MockContext(pClDevice);
55+
}
56+
};
57+
58+
class SyncBufferHandlerTest : public SyncBufferEnqueueHandlerTest {
2959
public:
3060
void SetUp() override {}
3161
void TearDown() override {}
3262

3363
template <typename FamilyType>
3464
void SetUpT() {
35-
EnqueueHandlerTest::SetUp();
65+
SyncBufferEnqueueHandlerTest::SetUp();
3666
kernelInternals = std::make_unique<MockKernelWithInternals>(*pClDevice, context);
3767
kernel = kernelInternals->mockKernel;
3868
kernel->executionType = KernelExecutionType::Concurrent;
3969
commandQueue = reinterpret_cast<MockCommandQueue *>(new MockCommandQueueHw<FamilyType>(context, pClDevice, 0));
70+
hwHelper = &HwHelper::get(kernel->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
4071
}
4172

4273
template <typename FamilyType>
4374
void TearDownT() {
4475
commandQueue->release();
4576
kernelInternals.reset();
46-
EnqueueHandlerTest::TearDown();
77+
SyncBufferEnqueueHandlerTest::TearDown();
4778
}
4879

4980
void patchAllocateSyncBuffer() {
@@ -61,6 +92,10 @@ class SyncBufferHandlerTest : public EnqueueHandlerTest {
6192
return clEnqueueNDCountKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr);
6293
}
6394

95+
// Asks the fixture's hwHelper whether cooperative dispatch is supported for the engine type
// of commandQueue's GPGPU engine combined with the product family of the kernel's device.
// Reads hwHelper, commandQueue and kernel, all of which SetUpT() assigns.
bool isCooperativeDispatchSupported() {
96+
return hwHelper->isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), kernel->getDevice().getHardwareInfo().platform.eProductFamily);
97+
}
98+
6499
const cl_uint workDim = 1;
65100
const size_t gwOffset[3] = {0, 0, 0};
66101
const size_t lws[3] = {10, 1, 1};
@@ -71,6 +106,7 @@ class SyncBufferHandlerTest : public EnqueueHandlerTest {
71106
MockKernel *kernel;
72107
MockCommandQueue *commandQueue;
73108
SPatchAllocateSyncBuffer sPatchAllocateSyncBuffer;
109+
HwHelper *hwHelper;
74110
};
75111

76112
HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenAllocateSyncBufferPatchAndConcurrentKernelWhenEnqueuingKernelThenSyncBufferIsUsed) {
@@ -109,7 +145,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenConcurrentKernelWithAllocateSyncB
109145
}
110146

111147
HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenMaxWorkgroupCountWhenEnqueuingConcurrentKernelThenSuccessIsReturned) {
112-
auto maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws);
148+
auto maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws, commandQueue);
113149
workgroupCount[0] = maxWorkGroupCount;
114150
globalWorkSize[0] = maxWorkGroupCount * lws[0];
115151

@@ -118,7 +154,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenMaxWorkgroupCountWhenEnqueuingCon
118154
}
119155

120156
HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenTooHighWorkgroupCountWhenEnqueuingConcurrentKernelThenErrorIsReturned) {
121-
size_t maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws);
157+
size_t maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws, commandQueue);
122158
workgroupCount[0] = maxWorkGroupCount + 1;
123159
globalWorkSize[0] = maxWorkGroupCount * lws[0] + 1;
124160

opencl/test/unit_test/gtpin/gtpin_tests.cpp

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424
#include "opencl/source/gtpin/gtpin_hw_helper.h"
2525
#include "opencl/source/gtpin/gtpin_init.h"
2626
#include "opencl/source/gtpin/gtpin_notify.h"
27+
#include "opencl/source/helpers/validators.h"
2728
#include "opencl/source/kernel/kernel.h"
2829
#include "opencl/source/mem_obj/buffer.h"
2930
#include "opencl/source/program/create.inl"
@@ -913,7 +914,13 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelINTELIsExecutedThenGT
913914

914915
cl_uint workDim = 1;
915916
size_t localWorkSize[3] = {1, 1, 1};
916-
size_t n = pKernel1->getMaxWorkGroupCount(workDim, localWorkSize);
917+
CommandQueue *commandQueue = nullptr;
918+
WithCastToInternal(cmdQ, &commandQueue);
919+
HwHelper &hwHelper = HwHelper::get(pDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
920+
if (!hwHelper.isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), pDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
921+
commandQueue->getGpgpuEngine().osContext = commandQueue->getDevice().getEngine(aub_stream::ENGINE_CCS, true, false).osContext;
922+
}
923+
size_t n = pKernel1->getMaxWorkGroupCount(workDim, localWorkSize, commandQueue);
917924
auto buff10 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr);
918925
auto buff11 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr);
919926

shared/source/helpers/hw_helper.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -126,6 +126,7 @@ class HwHelper {
126126
virtual bool useOnlyGlobalTimestamps() const = 0;
127127
virtual bool useSystemMemoryPlacementForISA(const HardwareInfo &hwInfo) const = 0;
128128
virtual bool packedFormatsSupported() const = 0;
129+
virtual bool isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const = 0;
129130

130131
static uint32_t getSubDevicesCount(const HardwareInfo *pHwInfo);
131132
static uint32_t getEnginesCount(const HardwareInfo &hwInfo);
@@ -308,6 +309,8 @@ class HwHelperHw : public HwHelper {
308309

309310
bool packedFormatsSupported() const override;
310311

312+
bool isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const override;
313+
311314
protected:
312315
LocalMemoryAccessMode getDefaultLocalMemoryAccessMode(const HardwareInfo &hwInfo) const override;
313316

shared/source/helpers/hw_helper_base.inl

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -504,4 +504,9 @@ bool MemorySynchronizationCommands<GfxFamily>::isPipeControlPriorToPipelineSelec
504504
return false;
505505
}
506506

507+
// Generic HwHelperHw definition of the cooperative-dispatch query: returns true
// unconditionally; neither `engine` nor `productFamily` is consulted.
// NOTE(review): presumably individual GfxFamily specializations narrow this — none are
// visible here; confirm.
template <typename GfxFamily>
508+
inline bool NEO::HwHelperHw<GfxFamily>::isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const {
509+
return true;
510+
}
511+
507512
} // namespace NEO

0 commit comments

Comments
 (0)