Add checks for correct engine for concurrent kernels.

SebastianLuzynski · Compute-Runtime-Automation · commit 225e7f01b4fa · 2020-10-26T14:53:52.000+01:00
Related-To: NEO-5135
Change-Id: Ib1c37ec8d5e468de331521ae4be1cd92902a2330
Signed-off-by: Sebastian Luzynski <sebastian.jozef.luzynski@intel.com>
diff --git a/opencl/source/api/api.cpp b/opencl/source/api/api.cpp
@@ -5541,7 +5541,9 @@ cl_int CL_API_CALL clGetKernelMaxConcurrentWorkGroupCountINTEL(cl_command_queue
         return retVal;
     }
 
-    *suggestedWorkGroupCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize);
+    CommandQueue *pCommandQueue = nullptr;
+    WithCastToInternal(commandQueue, &pCommandQueue);
+    *suggestedWorkGroupCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);
 
     return retVal;
 }
@@ -5579,6 +5581,13 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue,
         return retVal;
     }
 
+    auto &hardwareInfo = pKernel->getDevice().getHardwareInfo();
+    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+    if (!hwHelper.isCooperativeDispatchSupported(pCommandQueue->getGpgpuEngine().getEngineType(), hardwareInfo.platform.eProductFamily)) {
+        retVal = CL_INVALID_COMMAND_QUEUE;
+        return retVal;
+    }
+
     size_t globalWorkSize[3];
     for (size_t i = 0; i < workDim; i++) {
         globalWorkSize[i] = workgroupCount[i] * localWorkSize[i];
@@ -5589,7 +5598,7 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue,
         for (size_t i = 0; i < workDim; i++) {
             requestedNumberOfWorkgroups *= workgroupCount[i];
         }
-        size_t maximalNumberOfWorkgroupsAllowed = pKernel->getMaxWorkGroupCount(workDim, localWorkSize);
+        size_t maximalNumberOfWorkgroupsAllowed = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);
         if (requestedNumberOfWorkgroups > maximalNumberOfWorkgroupsAllowed) {
             retVal = CL_INVALID_VALUE;
             return retVal;
diff --git a/opencl/source/kernel/kernel.cpp b/opencl/source/kernel/kernel.cpp
@@ -1053,14 +1053,19 @@ void Kernel::getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *glob
         localWorkSize[2] = suggestedLws.z;
 }
 
-uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize) const {
+uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const {
     auto &hardwareInfo = getDevice().getHardwareInfo();
+    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+
+    if (!hwHelper.isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), hardwareInfo.platform.eProductFamily)) {
+        return 0;
+    }
+
     auto executionEnvironment = kernelInfo.patchInfo.executionEnvironment;
     auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
     if (dssCount == 0) {
         dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
     }
-    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
     auto availableThreadCount = hwHelper.calculateAvailableThreadCount(
         hardwareInfo.platform.eProductFamily,
         ((executionEnvironment != nullptr) ? executionEnvironment->NumGRFRequired : GrfConfig::DefaultGrfNumber),
diff --git a/opencl/source/kernel/kernel.h b/opencl/source/kernel/kernel.h
@@ -408,7 +408,7 @@ class Kernel : public BaseObject<_cl_kernel> {
     }
     void getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset,
                                    size_t *localWorkSize);
-    uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize) const;
+    uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const;
 
     uint64_t getKernelStartOffset(
         const bool localIdsGenerationByRuntime,
diff --git a/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl b/opencl/test/unit_test/api/cl_get_kernel_max_concurrent_work_group_count_intel_tests.inl
@@ -64,15 +64,15 @@ TEST_F(clGetKernelMaxConcurrentWorkGroupCountTests, GivenVariousInputWhenGetting
     retVal = clGetKernelMaxConcurrentWorkGroupCountINTEL(pCommandQueue, pKernel, workDim, globalWorkOffset, localWorkSize,
                                                          &maxConcurrentWorkGroupCount);
     EXPECT_EQ(CL_SUCCESS, retVal);
-    size_t expectedMaxConcurrentWorkGroupCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize);
+    size_t expectedMaxConcurrentWorkGroupCount = pKernel->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);
     EXPECT_EQ(expectedMaxConcurrentWorkGroupCount, maxConcurrentWorkGroupCount);
 
     std::unique_ptr<MockKernel> pKernelWithExecutionEnvironmentPatch(MockKernel::create(pCommandQueue->getDevice(), pProgram));
     retVal = clGetKernelMaxConcurrentWorkGroupCountINTEL(pCommandQueue, pKernelWithExecutionEnvironmentPatch.get(), workDim,
                                                          globalWorkOffset, localWorkSize,
                                                          &maxConcurrentWorkGroupCount);
     EXPECT_EQ(CL_SUCCESS, retVal);
-    expectedMaxConcurrentWorkGroupCount = pKernelWithExecutionEnvironmentPatch->getMaxWorkGroupCount(workDim, localWorkSize);
+    expectedMaxConcurrentWorkGroupCount = pKernelWithExecutionEnvironmentPatch->getMaxWorkGroupCount(workDim, localWorkSize, pCommandQueue);
     EXPECT_EQ(expectedMaxConcurrentWorkGroupCount, maxConcurrentWorkGroupCount);
 }
 
diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_1_tests.cpp
@@ -214,6 +214,11 @@ TEST_F(EnqueueKernelTest, givenKernelWhenAllArgsAreSetThenClEnqueueNDCountKernel
     cl_int retVal = CL_SUCCESS;
     CommandQueue *pCmdQ2 = createCommandQueue(pClDevice);
 
+    HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
+    if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
+        pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, true, false).osContext;
+    }
+
     std::unique_ptr<Kernel> kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal));
     EXPECT_EQ(CL_SUCCESS, retVal);
 
@@ -253,6 +258,11 @@ TEST_F(EnqueueKernelTest, givenKernelWhenNotAllArgsAreSetButSetKernelArgIsCalled
     cl_int retVal = CL_SUCCESS;
     CommandQueue *pCmdQ2 = createCommandQueue(pClDevice);
 
+    HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
+    if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
+        pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, true, false).osContext;
+    }
+
     std::unique_ptr<Kernel> kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal));
     EXPECT_EQ(CL_SUCCESS, retVal);
 
@@ -292,6 +302,11 @@ TEST_F(EnqueueKernelTest, givenKernelWhenSetKernelArgIsCalledForEachArgButAtLeas
     cl_int retVal = CL_SUCCESS;
     CommandQueue *pCmdQ2 = createCommandQueue(pClDevice);
 
+    HwHelper &hwHelper = HwHelper::get(pClDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
+    if (!hwHelper.isCooperativeDispatchSupported(pCmdQ2->getGpgpuEngine().getEngineType(), pClDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
+        pCmdQ2->getGpgpuEngine().osContext = pCmdQ2->getDevice().getEngine(aub_stream::ENGINE_CCS, true, false).osContext;
+    }
+
     std::unique_ptr<Kernel> kernel(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal));
     EXPECT_EQ(CL_SUCCESS, retVal);
 
diff --git a/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp b/opencl/test/unit_test/command_queue/sync_buffer_handler_tests.cpp
@@ -25,25 +25,56 @@ class MockSyncBufferHandler : public SyncBufferHandler {
     using SyncBufferHandler::usedBufferSize;
 };
 
-class SyncBufferHandlerTest : public EnqueueHandlerTest {
+class SyncBufferEnqueueHandlerTest : public EnqueueHandlerTest {
+  public:
+    void SetUp() {
+        hardwareInfo = *defaultHwInfo;
+        uint64_t hwInfoConfig = defaultHardwareInfoConfigTable[productFamily];
+        hardwareInfoSetup[productFamily](&hardwareInfo, true, hwInfoConfig);
+        SetUpImpl(&hardwareInfo);
+    }
+
+    void TearDown() {
+        context->decRefInternal();
+        delete pClDevice;
+        pClDevice = nullptr;
+        pDevice = nullptr;
+    }
+
+    void SetUpImpl(const NEO::HardwareInfo *hardwareInfo) {
+        pDevice = MockDevice::createWithNewExecutionEnvironment<MockDevice>(hardwareInfo);
+        ASSERT_NE(nullptr, pDevice);
+        pClDevice = new MockClDevice{pDevice};
+        ASSERT_NE(nullptr, pClDevice);
+
+        auto &commandStreamReceiver = pDevice->getGpgpuCommandStreamReceiver();
+        pTagMemory = commandStreamReceiver.getTagAddress();
+        ASSERT_NE(nullptr, const_cast<uint32_t *>(pTagMemory));
+
+        context = new NEO::MockContext(pClDevice);
+    }
+};
+
+class SyncBufferHandlerTest : public SyncBufferEnqueueHandlerTest {
   public:
     void SetUp() override {}
     void TearDown() override {}
 
     template <typename FamilyType>
     void SetUpT() {
-        EnqueueHandlerTest::SetUp();
+        SyncBufferEnqueueHandlerTest::SetUp();
         kernelInternals = std::make_unique<MockKernelWithInternals>(*pClDevice, context);
         kernel = kernelInternals->mockKernel;
         kernel->executionType = KernelExecutionType::Concurrent;
         commandQueue = reinterpret_cast<MockCommandQueue *>(new MockCommandQueueHw<FamilyType>(context, pClDevice, 0));
+        hwHelper = &HwHelper::get(kernel->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
     }
 
     template <typename FamilyType>
     void TearDownT() {
         commandQueue->release();
         kernelInternals.reset();
-        EnqueueHandlerTest::TearDown();
+        SyncBufferEnqueueHandlerTest::TearDown();
     }
 
     void patchAllocateSyncBuffer() {
@@ -61,6 +92,10 @@ class SyncBufferHandlerTest : public EnqueueHandlerTest {
         return clEnqueueNDCountKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr);
     }
 
+    bool isCooperativeDispatchSupported() {
+        return hwHelper->isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), kernel->getDevice().getHardwareInfo().platform.eProductFamily);
+    }
+
     const cl_uint workDim = 1;
     const size_t gwOffset[3] = {0, 0, 0};
     const size_t lws[3] = {10, 1, 1};
@@ -71,6 +106,7 @@ class SyncBufferHandlerTest : public EnqueueHandlerTest {
     MockKernel *kernel;
     MockCommandQueue *commandQueue;
     SPatchAllocateSyncBuffer sPatchAllocateSyncBuffer;
+    HwHelper *hwHelper;
 };
 
 HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenAllocateSyncBufferPatchAndConcurrentKernelWhenEnqueuingKernelThenSyncBufferIsUsed) {
@@ -109,7 +145,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenConcurrentKernelWithAllocateSyncB
 }
 
 HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenMaxWorkgroupCountWhenEnqueuingConcurrentKernelThenSuccessIsReturned) {
-    auto maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws);
+    auto maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws, commandQueue);
     workgroupCount[0] = maxWorkGroupCount;
     globalWorkSize[0] = maxWorkGroupCount * lws[0];
 
@@ -118,7 +154,7 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenMaxWorkgroupCountWhenEnqueuingCon
 }
 
 HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenTooHighWorkgroupCountWhenEnqueuingConcurrentKernelThenErrorIsReturned) {
-    size_t maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws);
+    size_t maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws, commandQueue);
     workgroupCount[0] = maxWorkGroupCount + 1;
     globalWorkSize[0] = maxWorkGroupCount * lws[0] + 1;
 
diff --git a/opencl/test/unit_test/gtpin/gtpin_tests.cpp b/opencl/test/unit_test/gtpin/gtpin_tests.cpp
@@ -24,6 +24,7 @@
 #include "opencl/source/gtpin/gtpin_hw_helper.h"
 #include "opencl/source/gtpin/gtpin_init.h"
 #include "opencl/source/gtpin/gtpin_notify.h"
+#include "opencl/source/helpers/validators.h"
 #include "opencl/source/kernel/kernel.h"
 #include "opencl/source/mem_obj/buffer.h"
 #include "opencl/source/program/create.inl"
@@ -913,7 +914,13 @@ TEST_F(GTPinTests, givenInitializedGTPinInterfaceWhenKernelINTELIsExecutedThenGT
 
     cl_uint workDim = 1;
     size_t localWorkSize[3] = {1, 1, 1};
-    size_t n = pKernel1->getMaxWorkGroupCount(workDim, localWorkSize);
+    CommandQueue *commandQueue = nullptr;
+    WithCastToInternal(cmdQ, &commandQueue);
+    HwHelper &hwHelper = HwHelper::get(pDevice->getDevice().getHardwareInfo().platform.eRenderCoreFamily);
+    if (!hwHelper.isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), pDevice->getDevice().getHardwareInfo().platform.eProductFamily)) {
+        commandQueue->getGpgpuEngine().osContext = commandQueue->getDevice().getEngine(aub_stream::ENGINE_CCS, true, false).osContext;
+    }
+    size_t n = pKernel1->getMaxWorkGroupCount(workDim, localWorkSize, commandQueue);
     auto buff10 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr);
     auto buff11 = clCreateBuffer(context, 0, n * sizeof(unsigned int), nullptr, nullptr);
 
diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h
@@ -126,6 +126,7 @@ class HwHelper {
     virtual bool useOnlyGlobalTimestamps() const = 0;
     virtual bool useSystemMemoryPlacementForISA(const HardwareInfo &hwInfo) const = 0;
     virtual bool packedFormatsSupported() const = 0;
+    virtual bool isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const = 0;
 
     static uint32_t getSubDevicesCount(const HardwareInfo *pHwInfo);
     static uint32_t getEnginesCount(const HardwareInfo &hwInfo);
@@ -308,6 +309,8 @@ class HwHelperHw : public HwHelper {
 
     bool packedFormatsSupported() const override;
 
+    bool isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const override;
+
   protected:
     LocalMemoryAccessMode getDefaultLocalMemoryAccessMode(const HardwareInfo &hwInfo) const override;
 
diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl
@@ -504,4 +504,9 @@ bool MemorySynchronizationCommands<GfxFamily>::isPipeControlPriorToPipelineSelec
     return false;
 }
 
+template <typename GfxFamily>
+inline bool NEO::HwHelperHw<GfxFamily>::isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const {
+    return true;
+}
+
 } // namespace NEO

Original file line number	Diff line number	Diff line change
`@@ -408,7 +408,7 @@ class Kernel : public BaseObject<_cl_kernel> {`
`408`	`408`	`}`
`409`	`409`	`void getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset,`
`410`	`410`	`size_t *localWorkSize);`
`411`		`- uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize) const;`
	`411`	`+ uint32_t getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const;`
`412`	`412`
`413`	`413`	`uint64_t getKernelStartOffset(`
`414`	`414`	`const bool localIdsGenerationByRuntime,`
Original file line number	Diff line number	Diff line change
`@@ -504,4 +504,9 @@ bool MemorySynchronizationCommands<GfxFamily>::isPipeControlPriorToPipelineSelec`
`504`	`504`	`return false;`
`505`	`505`	`}`
`506`	`506`
	`507`	`+template <typename GfxFamily>`
	`508`	`+inline bool NEO::HwHelperHw<GfxFamily>::isCooperativeDispatchSupported(const aub_stream::EngineType engine, const PRODUCT_FAMILY productFamily) const {`
	`509`	`+ return true;`
	`510`	`+}`
	`511`	`+`
`507`	`512`	`} // namespace NEO`