Skip to content

Commit c04f8e5

Browse files
Pass copy engines to waitUntilComplete in OpenCL command queue
Related-To: NEO-6057
Signed-off-by: Maciej Dziuban <[email protected]>
1 parent 9bb1ef4 commit c04f8e5

File tree

14 files changed: +48 -36 lines changed

14 files changed: +48 -36 lines changed

opencl/source/command_queue/command_queue.cpp

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ bool CommandQueue::isCompleted(uint32_t gpgpuTaskCount, CopyEngineState bcsState
188188
return false;
189189
}
190190

191-
void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, uint32_t bcsTaskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
191+
void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) {
192192
WAIT_ENTER()
193193

194194
DBG_LOG(LogTaskCounts, __FUNCTION__, "Waiting for taskCount:", gpgpuTaskCountToWait);
@@ -206,10 +206,10 @@ void CommandQueue::waitUntilComplete(uint32_t gpgpuTaskCountToWait, uint32_t bcs
206206
gtpinNotifyTaskCompletion(gpgpuTaskCountToWait);
207207
}
208208

209-
if (bcsEngine) {
210-
auto bcsCsr = getBcsCommandStreamReceiver(bcsEngine->getEngineType());
211-
bcsCsr->waitForTaskCountWithKmdNotifyFallback(bcsTaskCountToWait, 0, false, false);
212-
bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(bcsTaskCountToWait);
209+
for (const CopyEngineState &copyEngine : copyEnginesToWait) {
210+
auto bcsCsr = getBcsCommandStreamReceiver(copyEngine.engineType);
211+
bcsCsr->waitForTaskCountWithKmdNotifyFallback(copyEngine.taskCount, 0, false, false);
212+
bcsCsr->waitForTaskCountAndCleanTemporaryAllocationList(copyEngine.taskCount);
213213
}
214214

215215
getGpgpuCommandStreamReceiver().waitForTaskCountAndCleanTemporaryAllocationList(gpgpuTaskCountToWait);
@@ -919,7 +919,8 @@ void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHan
919919
deferredTimestampPackets->swapNodes(nodesToRelease);
920920
}
921921

922-
waitUntilComplete(taskCount, this->bcsState.taskCount, flushStamp->peekStamp(), false);
922+
Range<CopyEngineState> states{&bcsState, bcsState.isValid() ? 1u : 0u};
923+
waitUntilComplete(taskCount, states, flushStamp->peekStamp(), false);
923924

924925
if (printfHandler) {
925926
printfHandler->printEnqueueOutput();

opencl/source/command_queue/command_queue.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
#pragma once
99
#include "shared/source/helpers/engine_control.h"
10+
#include "shared/source/utilities/range.h"
1011

1112
#include "opencl/source/command_queue/copy_engine_state.h"
1213
#include "opencl/source/command_queue/csr_selection_args.h"
@@ -216,7 +217,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
216217

217218
MOCKABLE_VIRTUAL bool isQueueBlocked();
218219

219-
MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, uint32_t bcsTaskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep);
220+
MOCKABLE_VIRTUAL void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep);
220221
MOCKABLE_VIRTUAL void waitForAllEngines(bool blockedQueue, PrintfHandler *printfHandler);
221222

222223
static uint32_t getTaskLevelFromWaitList(uint32_t taskLevel,

opencl/source/command_queue/enqueue_common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
331331
getGpgpuCommandStreamReceiver().setMediaVFEStateDirty(true);
332332

333333
if (devQueueHw->getSchedulerReturnInstance() > 0) {
334-
waitUntilComplete(completionStamp.taskCount, this->bcsState.taskCount, completionStamp.flushStamp, false);
334+
waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false);
335335
this->runSchedulerSimulation(*devQueueHw, *parentKernel);
336336
}
337337
}

opencl/source/event/event.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,8 @@ inline bool Event::wait(bool blocking, bool useQuickKmdSleep) {
408408
}
409409
}
410410

411-
cmdQueue->waitUntilComplete(taskCount.load(), this->bcsState.taskCount, flushStamp->peekStamp(), useQuickKmdSleep);
411+
Range<CopyEngineState> states{&bcsState, bcsState.isValid() ? 1u : 0u};
412+
cmdQueue->waitUntilComplete(taskCount.load(), states, flushStamp->peekStamp(), useQuickKmdSleep);
412413
updateExecutionStatus();
413414

414415
DEBUG_BREAK_IF(this->taskLevel == CompletionStamp::notReady && this->executionStatus >= 0);

opencl/source/helpers/task_information.cpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ CompletionStamp &CommandMapUnmap::submit(uint32_t taskLevel, bool terminated) {
9797
commandQueue.updateLatestSentEnqueueType(EnqueueProperties::Operation::DependencyResolveOnGpu);
9898

9999
if (!memObj.isMemObjZeroCopy()) {
100-
commandQueue.waitUntilComplete(completionStamp.taskCount, 0u, completionStamp.flushStamp, false);
100+
commandQueue.waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false);
101101
if (operationType == MAP) {
102102
memObj.transferDataToHostPtr(copySize, copyOffset);
103103
} else if (!readOnly) {
@@ -287,10 +287,9 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
287287
dispatchFlags,
288288
commandQueue.getDevice());
289289

290-
uint32_t bcsTaskCount = 0u;
291290
if (kernelOperation->blitPropertiesContainer.size() > 0) {
292-
bcsTaskCount = bcsCsrForAuxTranslation->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
293-
commandQueue.updateBcsTaskCount(bcsCsrForAuxTranslation->getOsContext().getEngineType(), bcsTaskCount);
291+
const auto newTaskCount = bcsCsrForAuxTranslation->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
292+
commandQueue.updateBcsTaskCount(bcsCsrForAuxTranslation->getOsContext().getEngineType(), newTaskCount);
294293
}
295294
commandQueue.updateLatestSentEnqueueType(EnqueueProperties::Operation::GpuKernel);
296295

@@ -299,7 +298,7 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
299298
}
300299

301300
if (printfHandler) {
302-
commandQueue.waitUntilComplete(completionStamp.taskCount, bcsTaskCount, completionStamp.flushStamp, false);
301+
commandQueue.waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false);
303302
printfHandler.get()->printEnqueueOutput();
304303
}
305304

opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1275,7 +1275,8 @@ HWTEST_TEMPLATED_F(BlitEnqueueTaskCountTests, whenWaitUntilCompletionCalledThenW
12751275
uint32_t gpgpuTaskCount = 123;
12761276
uint32_t bcsTaskCount = 123;
12771277

1278-
commandQueue->waitUntilComplete(gpgpuTaskCount, bcsTaskCount, 0, false);
1278+
CopyEngineState bcsState{bcsCsr->getOsContext().getEngineType(), bcsTaskCount};
1279+
commandQueue->waitUntilComplete(gpgpuTaskCount, Range{&bcsState}, 0, false);
12791280

12801281
EXPECT_EQ(gpgpuTaskCount, static_cast<UltCommandStreamReceiver<FamilyType> *>(gpgpuCsr)->latestWaitForCompletionWithTimeoutTaskCount.load());
12811282
EXPECT_EQ(bcsTaskCount, static_cast<UltCommandStreamReceiver<FamilyType> *>(bcsCsr)->latestWaitForCompletionWithTimeoutTaskCount.load());

opencl/test/unit_test/command_queue/command_queue_tests.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -807,7 +807,7 @@ struct WaitForQueueCompletionTests : public ::testing::Test {
807807
template <typename Family>
808808
struct MyCmdQueue : public CommandQueueHw<Family> {
809809
MyCmdQueue(Context *context, ClDevice *device) : CommandQueueHw<Family>(context, device, nullptr, false){};
810-
void waitUntilComplete(uint32_t gpgpuTaskCountToWait, uint32_t bcsTaskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) override {
810+
void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) override {
811811
requestedUseQuickKmdSleep = useQuickKmdSleep;
812812
waitUntilCompleteCounter++;
813813
}

opencl/test/unit_test/command_queue/enqueue_handler_tests.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,9 @@ struct EnqueueHandlerWithAubSubCaptureTests : public EnqueueHandlerTest {
109109
public:
110110
MockCmdQWithAubSubCapture(Context *context, ClDevice *device) : CommandQueueHw<FamilyType>(context, device, nullptr, false) {}
111111

112-
void waitUntilComplete(uint32_t gpgpuTaskCountToWait, uint32_t bcsTaskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) override {
112+
void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) override {
113113
waitUntilCompleteCalled = true;
114-
CommandQueueHw<FamilyType>::waitUntilComplete(gpgpuTaskCountToWait, bcsTaskCountToWait, flushStampToWait, useQuickKmdSleep);
114+
CommandQueueHw<FamilyType>::waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep);
115115
}
116116

117117
void obtainNewTimestampPacketNodes(size_t numberOfNodes, TimestampPacketContainer &previousNodes, bool clearAllDependencies, CommandStreamReceiver &csr) override {

opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -801,9 +801,9 @@ struct EnqueueAuxKernelTests : public EnqueueKernelTest {
801801
auxTranslationDirection);
802802
}
803803

804-
void waitUntilComplete(uint32_t gpgpuTaskCountToWait, uint32_t bcsTaskCountToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) override {
804+
void waitUntilComplete(uint32_t gpgpuTaskCountToWait, Range<CopyEngineState> copyEnginesToWait, FlushStamp flushStampToWait, bool useQuickKmdSleep) override {
805805
waitCalled++;
806-
CommandQueueHw<FamilyType>::waitUntilComplete(gpgpuTaskCountToWait, bcsTaskCountToWait, flushStampToWait, useQuickKmdSleep);
806+
CommandQueueHw<FamilyType>::waitUntilComplete(gpgpuTaskCountToWait, copyEnginesToWait, flushStampToWait, useQuickKmdSleep);
807807
}
808808

809809
std::vector<AuxTranslationDirection> auxTranslationDirections;

opencl/test/unit_test/command_queue/enqueue_read_image_tests.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -762,13 +762,13 @@ HWTEST_F(EnqueueReadImageTest, GivenImage1DThatIsZeroCopyWhenReadImageWithTheSam
762762

763763
HWTEST_F(EnqueueReadImageTest, givenDeviceWithBlitterSupportWhenEnqueueReadImageThenBlitEnqueueImageAllowedReturnsCorrectResult) {
764764
DebugManagerStateRestore restorer;
765-
DebugManager.flags.OverrideInvalidEngineWithDefault.set(1);
766765
DebugManager.flags.EnableBlitterForEnqueueOperations.set(1);
767766
DebugManager.flags.EnableBlitterForEnqueueImageOperations.set(1);
768767

769768
auto hwInfo = pClDevice->getRootDeviceEnvironment().getMutableHardwareInfo();
770769
auto &hwHelper = HwHelper::get(hwInfo->platform.eRenderCoreFamily);
771770
hwInfo->capabilityTable.blitterOperationsSupported = true;
771+
REQUIRE_BLITTER_OR_SKIP(hwInfo);
772772
size_t origin[] = {0, 0, 0};
773773
auto mockCmdQ = std::make_unique<MockCommandQueueHw<FamilyType>>(context, pClDevice, nullptr);
774774
std::unique_ptr<Image> image(Image2dHelper<>::create(context));

0 commit comments

Comments
 (0)