Skip to content

Commit 835b344

Browse files
Add debug flag to disable GPU hang detection
This change introduces a new debug flag called DisableGpuHangDetection. The flag defaults to false, so GPU hang detection remains enabled unless it is set. To disable hang checking, set this flag to true. Related-To: NEO-6681 Signed-off-by: Patryk Wrobel <[email protected]>
1 parent 9d4daca commit 835b344

File tree

4 files changed

+23
-0
lines changed

4 files changed

+23
-0
lines changed

opencl/test/unit_test/test_files/igdrcl.config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,7 @@ ReuseKernelBinaries = -1
382382
EnableChipsetUniqueUUID = -1
383383
ForceSimdMessageSizeInWalker = -1
384384
UseNewQueryTopoIoctl = 1
385+
DisableGpuHangDetection = 0
385386
EnableRecoverablePageFaults = -1
386387
EnableImplicitMigrationOnFaultableHardware = -1
387388
UseDrmVirtualEnginesForCcs = -1

shared/source/command_stream/command_stream_receiver.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "shared/source/command_stream/experimental_command_buffer.h"
1313
#include "shared/source/command_stream/preemption.h"
1414
#include "shared/source/command_stream/scratch_space_controller.h"
15+
#include "shared/source/debug_settings/debug_settings_manager.h"
1516
#include "shared/source/device/device.h"
1617
#include "shared/source/direct_submission/direct_submission_controller.h"
1718
#include "shared/source/execution_environment/root_device_environment.h"
@@ -249,6 +250,10 @@ bool CommandStreamReceiver::skipResourceCleanup() const {
249250
}
250251

251252
// Reports whether a GPU hang has been detected for this receiver's OS context.
// Returns false unconditionally when the DisableGpuHangDetection debug flag is set;
// otherwise the answer comes from the driver model, if one is reachable.
bool CommandStreamReceiver::isGpuHangDetected() const {
253+
// Debug override: when the flag is set, the driver model is never queried.
if (DebugManager.flags.DisableGpuHangDetection.get()) {
254+
return false;
255+
}
256+
252257
// Short-circuit chain: a null osContext, OS interface or driver model yields false
// before the driver model's isGpuHangDetected() is reached.
return this->osContext && this->getOSInterface() && this->getOSInterface()->getDriverModel() && this->getOSInterface()->getDriverModel()->isGpuHangDetected(*osContext);
253258
}
254259

shared/source/debug_settings/debug_variables_base.inl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ DECLARE_DEBUG_VARIABLE(bool, AllowPatchingVfeStateInCommandLists, false, "true:
7373
DECLARE_DEBUG_VARIABLE(bool, PrintMemoryRegionSizes, false, "print memory bank type, instance and it's size")
7474
DECLARE_DEBUG_VARIABLE(bool, UpdateCrossThreadDataSize, false, "Turn on cross thread data size calculation for PATCH TOKEN binary")
7575
DECLARE_DEBUG_VARIABLE(bool, UseNewQueryTopoIoctl, true, "Use DRM_I915_QUERY_COMPUTE_SLICES")
76+
DECLARE_DEBUG_VARIABLE(bool, DisableGpuHangDetection, false, "Disable GPU hang detection")
7677
DECLARE_DEBUG_VARIABLE(std::string, ForceDeviceId, std::string("unk"), "DeviceId selected for testing")
7778
DECLARE_DEBUG_VARIABLE(std::string, FilterDeviceId, std::string("unk"), "Device id filter, adapter matching device id will be opened. Ignored when unk.")
7879
DECLARE_DEBUG_VARIABLE(std::string, FilterBdfPath, std::string("unk"), "Linux-only, BDF path filter, only matching paths will be opened. Ignored when unk.")

shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,22 @@ HWTEST_F(CommandStreamReceiverTest, whenStoreAllocationThenStoredAllocationHasTa
174174
EXPECT_EQ(csr.peekTaskCount(), allocation->getTaskCount(csr.getOsContext().getContextId()));
175175
}
176176

177+
// Checks that isGpuHangDetected() returns false when the DisableGpuHangDetection
// debug flag is set, even though the installed mock driver model has
// isGpuHangDetectedToReturn set to true.
// NOTE(review): the test name says the driver model is not called, but the body
// only asserts the returned value; no call-count check is visible here.
HWTEST_F(CommandStreamReceiverTest, givenDisableGpuHangDetectionFlagWhenCheckingGpuHangThenDriverModelIsNotCalledAndFalseIsReturned) {
178+
// Presumably restores the debug flags when it goes out of scope - TODO confirm.
DebugManagerStateRestore stateRestore;
179+
DebugManager.flags.DisableGpuHangDetection.set(true);
180+
181+
// Presumably makes the mock report a hang if it were queried - verify against MockDriverModel.
auto driverModelMock = std::make_unique<MockDriverModel>();
182+
driverModelMock->isGpuHangDetectedToReturn = true;
183+
184+
auto osInterface = std::make_unique<OSInterface>();
185+
osInterface->setDriverModel(std::move(driverModelMock));
186+
187+
// Install the OS interface (with the mock driver model) into the receiver's root device environment.
auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
188+
csr.executionEnvironment.rootDeviceEnvironments[csr.rootDeviceIndex]->osInterface = std::move(osInterface);
189+
190+
EXPECT_FALSE(csr.isGpuHangDetected());
191+
}
192+
177193
HWTEST_F(CommandStreamReceiverTest, givenGpuHangWhenWaititingForCompletionWithTimeoutThenGpuHangIsReturned) {
178194
auto driverModelMock = std::make_unique<MockDriverModel>();
179195
driverModelMock->isGpuHangDetectedToReturn = true;

0 commit comments

Comments
 (0)