Skip to content

Commit 1c3d5c3

Browse files
Prepare mechanism for returning GPU execution error on OCL API
Translate task count value to OCL error.
Related-To: NEO-7412
Signed-off-by: Mateusz Jablonski <[email protected]>
1 parent ab6af42 commit 1c3d5c3

22 files changed

+71
-53
lines changed

opencl/source/command_queue/command_queue.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,18 @@ CommandQueue *CommandQueue::create(Context *context,
6060
return funcCreate(context, device, properties, internalUsage);
6161
}
6262

63+
// Translates a task-count value into the OpenCL status code handed back to the caller.
//   CompletionStamp::gpuHang, CompletionStamp::outOfDeviceMemory -> CL_OUT_OF_RESOURCES
//   CompletionStamp::outOfHostMemory                             -> CL_OUT_OF_HOST_MEMORY
//   any other value                                              -> CL_SUCCESS
cl_int CommandQueue::getErrorCodeFromTaskCount(uint32_t taskCount) {
64+
switch (taskCount) {
65+
case CompletionStamp::gpuHang:
66+
case CompletionStamp::outOfDeviceMemory:
67+
return CL_OUT_OF_RESOURCES;
68+
case CompletionStamp::outOfHostMemory:
69+
return CL_OUT_OF_HOST_MEMORY;
70+
default:
71+
return CL_SUCCESS; // value is not one of the error sentinels handled above
72+
}
73+
}
74+
6375
CommandQueue::CommandQueue(Context *context, ClDevice *device, const cl_queue_properties *properties, bool internalUsage)
6476
: context(context), device(device), isInternalUsage(internalUsage) {
6577
if (context) {

opencl/source/command_queue/command_queue.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
5858
bool internalUsage,
5959
cl_int &errcodeRet);
6060

61+
static cl_int getErrorCodeFromTaskCount(uint32_t taskCount);
62+
6163
CommandQueue() = delete;
6264

6365
CommandQueue(Context *context, ClDevice *device, const cl_queue_properties *properties, bool internalUsage);

opencl/source/command_queue/enqueue_common.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -335,8 +335,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
335335
this->latestSentEnqueueType = enqueueProperties.operation;
336336
}
337337

338-
if (completionStamp.taskCount == CompletionStamp::gpuHang) {
339-
return CL_OUT_OF_RESOURCES;
338+
if (completionStamp.taskCount > CompletionStamp::notReady) {
339+
return CommandQueue::getErrorCodeFromTaskCount(completionStamp.taskCount);
340340
}
341341

342342
updateFromCompletionStamp(completionStamp, eventBuilder.getEvent());
@@ -825,14 +825,14 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueNonBlocked(
825825
if (enqueueProperties.blitPropertiesContainer->size() > 0) {
826826
auto bcsCsr = getBcsForAuxTranslation();
827827
const auto newTaskCount = bcsCsr->flushBcsTask(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled(), getDevice());
828-
if (!newTaskCount) {
828+
if (newTaskCount > CompletionStamp::notReady) {
829829
CompletionStamp completionStamp{};
830-
completionStamp.taskCount = CompletionStamp::gpuHang;
830+
completionStamp.taskCount = newTaskCount;
831831

832832
return completionStamp;
833833
}
834834

835-
this->updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), *newTaskCount);
835+
this->updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount);
836836
dispatchFlags.implicitFlush = true;
837837
}
838838

@@ -1063,14 +1063,14 @@ CompletionStamp CommandQueueHw<GfxFamily>::enqueueCommandWithoutKernel(
10631063
if (enqueueProperties.operation == EnqueueProperties::Operation::Blit) {
10641064
UNRECOVERABLE_IF(!enqueueProperties.blitPropertiesContainer);
10651065
const auto newTaskCount = bcsCsr->flushBcsTask(*enqueueProperties.blitPropertiesContainer, false, this->isProfilingEnabled(), getDevice());
1066-
if (!newTaskCount) {
1066+
if (newTaskCount > CompletionStamp::notReady) {
10671067
CompletionStamp completionStamp{};
1068-
completionStamp.taskCount = CompletionStamp::gpuHang;
1068+
completionStamp.taskCount = newTaskCount;
10691069

10701070
return completionStamp;
10711071
}
10721072

1073-
this->updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), *newTaskCount);
1073+
this->updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount);
10741074
}
10751075

10761076
return completionStamp;
@@ -1276,8 +1276,8 @@ cl_int CommandQueueHw<GfxFamily>::enqueueBlit(const MultiDispatchInfo &multiDisp
12761276
completionStamp = enqueueCommandWithoutKernel(nullptr, 0, gpgpuCommandStream, gpgpuCommandStreamStart, blocking,
12771277
enqueueProperties, timestampPacketDependencies, eventsRequest,
12781278
eventBuilder, taskLevel, csrDeps, &bcsCsr);
1279-
if (completionStamp.taskCount == CompletionStamp::gpuHang) {
1280-
return CL_OUT_OF_RESOURCES;
1279+
if (completionStamp.taskCount > CompletionStamp::notReady) {
1280+
return CommandQueue::getErrorCodeFromTaskCount(completionStamp.taskCount);
12811281
}
12821282

12831283
if (gpgpuSubmission) {

opencl/source/event/event.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -597,7 +597,7 @@ void Event::submitCommand(bool abortTasks) {
597597
setEndTimeStamp();
598598
}
599599

600-
if (complStamp.taskCount == CompletionStamp::gpuHang) {
600+
if (complStamp.taskCount > CompletionStamp::notReady) {
601601
abortExecutionDueToGpuHang();
602602
return;
603603
}

opencl/source/helpers/task_information.cpp

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -258,14 +258,12 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
258258
commandQueue.clearLastBcsPackets();
259259
}
260260

261-
bool isGpuHangDetected{false};
262-
263261
if (kernelOperation->blitPropertiesContainer.size() > 0) {
264262
const auto newTaskCount = bcsCsrForAuxTranslation->flushBcsTask(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
265-
if (newTaskCount) {
266-
commandQueue.updateBcsTaskCount(bcsCsrForAuxTranslation->getOsContext().getEngineType(), *newTaskCount);
263+
if (newTaskCount <= CompletionStamp::notReady) {
264+
commandQueue.updateBcsTaskCount(bcsCsrForAuxTranslation->getOsContext().getEngineType(), newTaskCount);
267265
} else {
268-
isGpuHangDetected = true;
266+
completionStamp.taskCount = newTaskCount;
269267
}
270268
}
271269
commandQueue.updateLatestSentEnqueueType(EnqueueProperties::Operation::GpuKernel);
@@ -277,11 +275,11 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
277275
if (printfHandler) {
278276
const auto waitStatus = commandQueue.waitUntilComplete(completionStamp.taskCount, {}, completionStamp.flushStamp, false);
279277
if (waitStatus == WaitStatus::GpuHang) {
280-
isGpuHangDetected = true;
278+
completionStamp.taskCount = CompletionStamp::gpuHang;
281279
}
282280

283281
if (!printfHandler->printEnqueueOutput()) {
284-
isGpuHangDetected = true;
282+
completionStamp.taskCount = CompletionStamp::gpuHang;
285283
}
286284
}
287285

@@ -290,14 +288,10 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate
290288
}
291289
surfaces.clear();
292290

293-
if (isGpuHangDetected) {
294-
completionStamp.taskCount = CompletionStamp::gpuHang;
295-
}
296-
297291
return completionStamp;
298292
}
299293

300-
bool CommandWithoutKernel::dispatchBlitOperation() {
294+
uint32_t CommandWithoutKernel::dispatchBlitOperation() {
301295
auto bcsCsr = kernelOperation->bcsCsr;
302296
UNRECOVERABLE_IF(bcsCsr == nullptr);
303297

@@ -314,14 +308,14 @@ bool CommandWithoutKernel::dispatchBlitOperation() {
314308
}
315309

316310
const auto newTaskCount = bcsCsr->flushBcsTask(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice());
317-
if (!newTaskCount) {
318-
return false;
311+
if (newTaskCount > CompletionStamp::notReady) {
312+
return newTaskCount;
319313
}
320314

321-
commandQueue.updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), *newTaskCount);
315+
commandQueue.updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount);
322316
commandQueue.setLastBcsPacket(bcsCsr->getOsContext().getEngineType());
323317

324-
return true;
318+
return newTaskCount;
325319
}
326320

327321
CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminated) {
@@ -420,8 +414,9 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate
420414
}
421415

422416
if (kernelOperation->blitEnqueue) {
423-
if (!dispatchBlitOperation()) {
424-
completionStamp.taskCount = CompletionStamp::gpuHang;
417+
auto taskCount = dispatchBlitOperation();
418+
if (taskCount > CompletionStamp::notReady) {
419+
completionStamp.taskCount = taskCount;
425420
}
426421
}
427422

opencl/source/helpers/task_information.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,6 @@ class CommandWithoutKernel : public Command {
156156
public:
157157
using Command::Command;
158158
CompletionStamp &submit(uint32_t taskLevel, bool terminated) override;
159-
bool dispatchBlitOperation();
159+
uint32_t dispatchBlitOperation();
160160
};
161161
} // namespace NEO

opencl/source/program/printf_handler.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ bool PrintfHandler::printEnqueueOutput() {
9898
0, 0, 0, Vec3<size_t>(printfOutputSize, 0, 0), 0, 0, 0, 0));
9999

100100
const auto newTaskCount = bcsEngine.commandStreamReceiver->flushBcsTask(blitPropertiesContainer, true, false, device);
101-
if (!newTaskCount) {
101+
if (newTaskCount > CompletionStamp::notReady) {
102102
return false;
103103
}
104104
}

opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenGpuHangOnFlushBcsAndBlitAuxTran
113113

114114
auto ultBcsCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(bcsCsr);
115115
ultBcsCsr->callBaseFlushBcsTask = false;
116-
ultBcsCsr->flushBcsTaskReturnValue = std::nullopt;
116+
ultBcsCsr->flushBcsTaskReturnValue = CompletionStamp::gpuHang;
117117

118118
auto mockCmdQ = static_cast<MockCommandQueueHw<FamilyType> *>(commandQueue.get());
119119

@@ -666,7 +666,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenGpuHangOnFlushBcsTaskAndBlitTra
666666

667667
auto ultBcsCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(bcsCsr);
668668
ultBcsCsr->callBaseFlushBcsTask = false;
669-
ultBcsCsr->flushBcsTaskReturnValue = std::nullopt;
669+
ultBcsCsr->flushBcsTaskReturnValue = CompletionStamp::gpuHang;
670670

671671
UserEvent userEvent;
672672
cl_event waitlist[] = {&userEvent};
@@ -943,7 +943,7 @@ HWTEST_TEMPLATED_F(BlitEnqueueWithDebugCapabilityTests, givenDebugFlagSetWhenDis
943943
HWTEST_TEMPLATED_F(BlitEnqueueWithDebugCapabilityTests, givenGpuHangOnFlushBcsTaskAndDebugFlagSetWhenDispatchingBlitEnqueueThenOutOfResourcesIsReturned) {
944944
auto ultBcsCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(bcsCsr);
945945
ultBcsCsr->callBaseFlushBcsTask = false;
946-
ultBcsCsr->flushBcsTaskReturnValue = std::nullopt;
946+
ultBcsCsr->flushBcsTaskReturnValue = CompletionStamp::gpuHang;
947947

948948
buffer = createBuffer(1, false);
949949
buffer->forceDisallowCPUCopy = true;
@@ -1194,7 +1194,7 @@ HWTEST_TEMPLATED_F(BlitEnqueueFlushTests, givenGpuHangOnFlushBcsTaskAndBlockedQu
11941194
auto myUltBcsCsr = static_cast<MyUltCsr<FamilyType> *>(bcsCsr);
11951195
myUltBcsCsr->flushCounter = &flushCounter;
11961196
myUltBcsCsr->callBaseFlushBcsTask = false;
1197-
myUltBcsCsr->flushBcsTaskReturnValue = std::nullopt;
1197+
myUltBcsCsr->flushBcsTaskReturnValue = CompletionStamp::gpuHang;
11981198

11991199
UserEvent userEvent;
12001200
cl_event waitlist[] = {&userEvent};

opencl/test/unit_test/command_queue/command_queue_tests.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,13 @@ INSTANTIATE_TEST_CASE_P(CommandQueue,
128128
CommandQueueTest,
129129
::testing::ValuesIn(AllCommandQueueProperties));
130130

131+
// Verifies the task-count-to-error mapping: 0 must yield CL_SUCCESS, and each of the
// three CompletionStamp error values must yield its corresponding OpenCL error code.
TEST(CommandQueue, WhenGettingErrorCodeFromTaskCountThenProperValueIsReturned) {
132+
EXPECT_EQ(CL_SUCCESS, CommandQueue::getErrorCodeFromTaskCount(0));
133+
EXPECT_EQ(CL_OUT_OF_HOST_MEMORY, CommandQueue::getErrorCodeFromTaskCount(CompletionStamp::outOfHostMemory));
134+
EXPECT_EQ(CL_OUT_OF_RESOURCES, CommandQueue::getErrorCodeFromTaskCount(CompletionStamp::outOfDeviceMemory));
135+
EXPECT_EQ(CL_OUT_OF_RESOURCES, CommandQueue::getErrorCodeFromTaskCount(CompletionStamp::gpuHang));
136+
}
137+
131138
TEST(CommandQueue, WhenConstructingCommandQueueThenTaskLevelAndTaskCountAreZero) {
132139
MockCommandQueue cmdQ(nullptr, nullptr, 0, false);
133140
EXPECT_EQ(0u, cmdQ.taskLevel);

opencl/test/unit_test/command_stream/command_stream_receiver_hw_2_tests.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -650,12 +650,12 @@ HWTEST_F(BcsTests, GivenNoneGpuHangWhenBlitFromHostPtrCalledThenCallWaitWithKmdF
650650
0, 0, {1, 1, 1}, 0, 0, 0, 0);
651651

652652
const auto taskCount1 = flushBcsTask(myMockCsr.get(), blitProperties, false, *pDevice);
653-
EXPECT_TRUE(taskCount1.has_value());
653+
EXPECT_NE(CompletionStamp::gpuHang, taskCount1);
654654

655655
EXPECT_EQ(0u, myMockCsr->waitForTaskCountWithKmdNotifyFallbackCalled);
656656

657657
const auto taskCount2 = flushBcsTask(myMockCsr.get(), blitProperties, true, *pDevice);
658-
EXPECT_TRUE(taskCount2.has_value());
658+
EXPECT_NE(CompletionStamp::gpuHang, taskCount2);
659659

660660
EXPECT_EQ(1u, myMockCsr->waitForTaskCountWithKmdNotifyFallbackCalled);
661661
EXPECT_EQ(myMockCsr->taskCount, myMockCsr->taskCountToWaitPassed);
@@ -687,14 +687,14 @@ HWTEST_F(BcsTests, GivenGpuHangWhenBlitFromHostPtrCalledThenCallWaitWithKmdFallb
687687
0, 0, {1, 1, 1}, 0, 0, 0, 0);
688688

689689
const auto taskCount1 = flushBcsTask(myMockCsr.get(), blitProperties, false, *pDevice);
690-
EXPECT_TRUE(taskCount1.has_value());
690+
EXPECT_NE(CompletionStamp::gpuHang, taskCount1);
691691

692692
EXPECT_EQ(0u, myMockCsr->waitForTaskCountWithKmdNotifyFallbackCalled);
693693

694694
myMockCsr->waitForTaskCountWithKmdNotifyFallbackReturnValue = WaitStatus::GpuHang;
695695

696696
const auto taskCount2 = flushBcsTask(myMockCsr.get(), blitProperties, true, *pDevice);
697-
EXPECT_FALSE(taskCount2.has_value());
697+
EXPECT_EQ(CompletionStamp::gpuHang, taskCount2);
698698

699699
EXPECT_EQ(1u, myMockCsr->waitForTaskCountWithKmdNotifyFallbackCalled);
700700
EXPECT_EQ(myMockCsr->taskCount, myMockCsr->taskCountToWaitPassed);

0 commit comments

Comments
 (0)