Skip to content

Commit 18cafd3

Browse files
Implement GPU hang detection on Windows
This change uses the value stored at the monitored fence's cpuAddress to detect a GPU hang. Related-To: NEO-5313 Signed-off-by: Patryk Wrobel <[email protected]>
1 parent 61ca84e commit 18cafd3

File tree

12 files changed

+69
-86
lines changed

12 files changed

+69
-86
lines changed

level_zero/core/test/unit_tests/fixtures/device_fixture.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,10 @@ struct MockDriverModel : NEO::DriverModel {
3838
size_t getMaxMemAllocSize() const override {
3939
return maxAllocSize;
4040
}
41+
42+
bool isGpuHangDetected(NEO::OsContext &osContext) override {
43+
return false;
44+
}
4145
};
4246

4347
struct MockDriverModelWDDM : NEO::DriverModel {
@@ -51,6 +55,10 @@ struct MockDriverModelWDDM : NEO::DriverModel {
5155
size_t getMaxMemAllocSize() const override {
5256
return maxAllocSize;
5357
}
58+
59+
bool isGpuHangDetected(NEO::OsContext &osContext) override {
60+
return false;
61+
}
5462
};
5563

5664
struct MockDriverModelDRM : NEO::DriverModel {
@@ -64,6 +72,10 @@ struct MockDriverModelDRM : NEO::DriverModel {
6472
size_t getMaxMemAllocSize() const override {
6573
return maxAllocSize;
6674
}
75+
76+
bool isGpuHangDetected(NEO::OsContext &osContext) override {
77+
return false;
78+
}
6779
};
6880

6981
struct ContextShareableMock : public L0::ContextImp {

level_zero/tools/test/unit_tests/sources/sysman/linux/test_sysman.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (C) 2020-2021 Intel Corporation
2+
* Copyright (C) 2020-2022 Intel Corporation
33
*
44
* SPDX-License-Identifier: MIT
55
*
@@ -366,6 +366,10 @@ class UnknownDriverModel : public DriverModel {
366366
PhysicalDevicePciBusInfo pciBusInfo(PhysicalDevicePciBusInfo::InvalidValue, PhysicalDevicePciBusInfo::InvalidValue, PhysicalDevicePciBusInfo::InvalidValue, PhysicalDevicePciBusInfo::InvalidValue);
367367
return pciBusInfo;
368368
}
369+
370+
bool isGpuHangDetected(OsContext &osContext) override {
371+
return false;
372+
}
369373
};
370374

371375
using SysmanUnknownDriverModelTest = Test<DeviceFixture>;

opencl/test/unit_test/os_interface/linux/drm_tests.cpp

Lines changed: 9 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1038,30 +1038,6 @@ TEST(DrmTest, GivenCompletionFenceDebugFlagWhenCreatingDrmObjectThenExpectCorrec
10381038
EXPECT_FALSE(drmDisabled.completionFenceSupport());
10391039
}
10401040

1041-
TEST(DrmTest, GivenInvalidContextIdWhenIsGpuHangIsCalledThenErrorIsThrown) {
1042-
ExecutionEnvironment executionEnvironment{};
1043-
executionEnvironment.prepareRootDeviceEnvironments(1);
1044-
1045-
DrmMock drm{*executionEnvironment.rootDeviceEnvironments[0]};
1046-
uint32_t contextId{0};
1047-
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
1048-
1049-
CommandStreamReceiver *csr{nullptr};
1050-
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
1051-
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
1052-
1053-
auto memoryManager = std::make_unique<MockMemoryManager>();
1054-
auto memoryManagerRaw = memoryManager.get();
1055-
1056-
memoryManagerRaw->registeredEngines = std::move(engines);
1057-
executionEnvironment.memoryManager = std::move(memoryManager);
1058-
1059-
const auto invalidContextId = 1;
1060-
EXPECT_THROW(drm.isGpuHangDetected(invalidContextId), std::runtime_error);
1061-
1062-
memoryManagerRaw->registeredEngines.clear();
1063-
}
1064-
10651041
TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenErrorIsThrown) {
10661042
ExecutionEnvironment executionEnvironment{};
10671043
executionEnvironment.prepareRootDeviceEnvironments(1);
@@ -1070,22 +1046,11 @@ TEST(DrmTest, GivenIoctlErrorWhenIsGpuHangIsCalledThenErrorIsThrown) {
10701046
uint32_t contextId{0};
10711047
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
10721048

1073-
CommandStreamReceiver *csr{nullptr};
10741049
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
1075-
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
1076-
1077-
auto memoryManager = std::make_unique<MockMemoryManager>();
1078-
auto memoryManagerRaw = memoryManager.get();
1079-
1080-
memoryManagerRaw->registeredEngines = std::move(engines);
1081-
executionEnvironment.memoryManager = std::move(memoryManager);
1082-
10831050
mockOsContextLinux.drmContextIds.push_back(0);
10841051
mockOsContextLinux.drmContextIds.push_back(3);
10851052

1086-
EXPECT_THROW(drm.isGpuHangDetected(0), std::runtime_error);
1087-
1088-
memoryManagerRaw->registeredEngines.clear();
1053+
EXPECT_THROW(drm.isGpuHangDetected(mockOsContextLinux), std::runtime_error);
10891054
}
10901055

10911056
TEST(DrmTest, GivenZeroBatchActiveAndZeroBatchPendingResetStatsWhenIsGpuHangIsCalledThenNoHangIsReported) {
@@ -1096,30 +1061,20 @@ TEST(DrmTest, GivenZeroBatchActiveAndZeroBatchPendingResetStatsWhenIsGpuHangIsCa
10961061
uint32_t contextId{0};
10971062
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
10981063

1099-
CommandStreamReceiver *csr{nullptr};
11001064
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
1101-
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
1102-
1103-
auto memoryManager = std::make_unique<MockMemoryManager>();
1104-
auto memoryManagerRaw = memoryManager.get();
1105-
1106-
memoryManagerRaw->registeredEngines = std::move(engines);
1107-
executionEnvironment.memoryManager = std::move(memoryManager);
1065+
mockOsContextLinux.drmContextIds.push_back(0);
1066+
mockOsContextLinux.drmContextIds.push_back(3);
11081067

11091068
drm_i915_reset_stats resetStats{};
11101069
resetStats.ctx_id = 0;
1111-
mockOsContextLinux.drmContextIds.push_back(0);
11121070
drm.resetStatsToReturn.push_back(resetStats);
11131071

11141072
resetStats.ctx_id = 3;
1115-
mockOsContextLinux.drmContextIds.push_back(3);
11161073
drm.resetStatsToReturn.push_back(resetStats);
11171074

11181075
bool isGpuHangDetected{};
1119-
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
1076+
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
11201077
EXPECT_FALSE(isGpuHangDetected);
1121-
1122-
memoryManagerRaw->registeredEngines.clear();
11231078
}
11241079

11251080
TEST(DrmTest, GivenBatchActiveGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThenHangIsReported) {
@@ -1130,31 +1085,21 @@ TEST(DrmTest, GivenBatchActiveGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThen
11301085
uint32_t contextId{0};
11311086
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
11321087

1133-
CommandStreamReceiver *csr{nullptr};
11341088
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
1135-
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
1136-
1137-
auto memoryManager = std::make_unique<MockMemoryManager>();
1138-
auto memoryManagerRaw = memoryManager.get();
1139-
1140-
memoryManagerRaw->registeredEngines = std::move(engines);
1141-
executionEnvironment.memoryManager = std::move(memoryManager);
1089+
mockOsContextLinux.drmContextIds.push_back(0);
1090+
mockOsContextLinux.drmContextIds.push_back(3);
11421091

11431092
drm_i915_reset_stats resetStats{};
11441093
resetStats.ctx_id = 0;
1145-
mockOsContextLinux.drmContextIds.push_back(0);
11461094
drm.resetStatsToReturn.push_back(resetStats);
11471095

11481096
resetStats.ctx_id = 3;
11491097
resetStats.batch_active = 2;
1150-
mockOsContextLinux.drmContextIds.push_back(3);
11511098
drm.resetStatsToReturn.push_back(resetStats);
11521099

11531100
bool isGpuHangDetected{};
1154-
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
1101+
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
11551102
EXPECT_TRUE(isGpuHangDetected);
1156-
1157-
memoryManagerRaw->registeredEngines.clear();
11581103
}
11591104

11601105
TEST(DrmTest, GivenBatchPendingGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThenHangIsReported) {
@@ -1165,27 +1110,17 @@ TEST(DrmTest, GivenBatchPendingGreaterThanZeroResetStatsWhenIsGpuHangIsCalledThe
11651110
uint32_t contextId{0};
11661111
EngineDescriptor engineDescriptor{EngineDescriptorHelper::getDefaultDescriptor({aub_stream::ENGINE_BCS, EngineUsage::Regular})};
11671112

1168-
CommandStreamReceiver *csr{nullptr};
11691113
MockOsContextLinux mockOsContextLinux{drm, contextId, engineDescriptor};
1170-
EngineControlContainer engines{EngineControl{csr, &mockOsContextLinux}};
1171-
1172-
auto memoryManager = std::make_unique<MockMemoryManager>();
1173-
auto memoryManagerRaw = memoryManager.get();
1174-
1175-
memoryManagerRaw->registeredEngines = std::move(engines);
1176-
executionEnvironment.memoryManager = std::move(memoryManager);
1114+
mockOsContextLinux.drmContextIds.push_back(8);
11771115

11781116
drm_i915_reset_stats resetStats{};
11791117
resetStats.ctx_id = 8;
11801118
resetStats.batch_pending = 7;
1181-
mockOsContextLinux.drmContextIds.push_back(8);
11821119
drm.resetStatsToReturn.push_back(resetStats);
11831120

11841121
bool isGpuHangDetected{};
1185-
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(0));
1122+
EXPECT_NO_THROW(isGpuHangDetected = drm.isGpuHangDetected(mockOsContextLinux));
11861123
EXPECT_TRUE(isGpuHangDetected);
1187-
1188-
memoryManagerRaw->registeredEngines.clear();
11891124
}
11901125

11911126
TEST(DrmTest, givenSetupIoctlHelperThenIoctlHelperNotNull) {

opencl/test/unit_test/os_interface/windows/wddm20_tests.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,9 @@
3838
#include "gtest/gtest.h"
3939
#include "mock_gmm_memory.h"
4040

41+
#include <cstdint>
4142
#include <functional>
43+
#include <limits>
4244
#include <memory>
4345

4446
namespace NEO {
@@ -344,6 +346,24 @@ TEST_F(Wddm20Tests, givenGraphicsAllocationWhenItIsMappedInHeap0ThenItHasGpuAddr
344346
EXPECT_LE(gpuAddress, cannonizedHeapEnd);
345347
}
346348

349+
TEST_F(Wddm20WithMockGdiDllTests, GivenInvalidCpuAddressWhenCheckingForGpuHangThenFalseIsReturned) {
350+
osContext->getResidencyController().getMonitoredFence().cpuAddress = nullptr;
351+
EXPECT_FALSE(wddm->isGpuHangDetected(*osContext));
352+
}
353+
354+
TEST_F(Wddm20WithMockGdiDllTests, GivenCpuValueDifferentThanGpuHangIndicationWhenCheckingForGpuHangThenFalseIsReturned) {
355+
constexpr auto cpuValue{777u};
356+
ASSERT_NE(NEO::Wddm::gpuHangIndication, cpuValue);
357+
358+
*osContext->getResidencyController().getMonitoredFence().cpuAddress = cpuValue;
359+
EXPECT_FALSE(wddm->isGpuHangDetected(*osContext));
360+
}
361+
362+
TEST_F(Wddm20WithMockGdiDllTests, GivenGpuHangIndicationWhenCheckingForGpuHangThenTrueIsReturned) {
363+
*osContext->getResidencyController().getMonitoredFence().cpuAddress = NEO::Wddm::gpuHangIndication;
364+
EXPECT_TRUE(wddm->isGpuHangDetected(*osContext));
365+
}
366+
347367
TEST_F(Wddm20WithMockGdiDllTests, GivenThreeOsHandlesWhenAskedForDestroyAllocationsThenAllMarkedAllocationsAreDestroyed) {
348368
OsHandleStorage storage;
349369
OsHandleWin osHandle1;

shared/source/command_stream/command_stream_receiver.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,7 +243,7 @@ bool CommandStreamReceiver::skipResourceCleanup() const {
243243
}
244244

245245
bool CommandStreamReceiver::isGpuHangDetected() const {
246-
return this->getOSInterface() && this->getOSInterface()->getDriverModel() && this->getOSInterface()->getDriverModel()->isGpuHangDetected(osContext->getContextId());
246+
return this->osContext && this->getOSInterface() && this->getOSInterface()->getDriverModel() && this->getOSInterface()->getDriverModel()->isGpuHangDetected(*osContext);
247247
}
248248

249249
void CommandStreamReceiver::cleanupResources() {

shared/source/os_interface/linux/drm_neo.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -318,11 +318,8 @@ int Drm::queryGttSize(uint64_t &gttSizeOutput) {
318318
return ret;
319319
}
320320

321-
bool Drm::isGpuHangDetected(uint32_t contextId) {
322-
const auto &engines = this->rootDeviceEnvironment.executionEnvironment.memoryManager->getRegisteredEngines();
323-
UNRECOVERABLE_IF(engines.size() <= contextId);
324-
325-
const auto osContextLinux = static_cast<OsContextLinux *>(engines[contextId].osContext);
321+
bool Drm::isGpuHangDetected(OsContext &osContext) {
322+
const auto osContextLinux = static_cast<OsContextLinux *>(&osContext);
326323
const auto &drmContextIds = osContextLinux->getDrmContextIds();
327324

328325
for (const auto drmContextId : drmContextIds) {

shared/source/os_interface/linux/drm_neo.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ class Drm : public DriverModel {
148148
MOCKABLE_VIRTUAL void getPrelimVersion(std::string &prelimVersion);
149149

150150
PhysicalDevicePciBusInfo getPciBusInfo() const override;
151-
bool isGpuHangDetected(uint32_t contextId) override;
151+
bool isGpuHangDetected(OsContext &osContext) override;
152152

153153
bool areNonPersistentContextsSupported() const { return nonPersistentContextsSupported; }
154154
void checkNonPersistentContextsSupport();

shared/source/os_interface/os_interface.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
namespace NEO {
2020
class ExecutionEnvironment;
2121
class MemoryManager;
22+
class OsContext;
2223

2324
class HwDeviceId : public NonCopyableClass {
2425
public:
@@ -85,9 +86,7 @@ class DriverModel : public NonCopyableClass {
8586
return false;
8687
}
8788

88-
virtual bool isGpuHangDetected(uint32_t contextId) {
89-
return false;
90-
}
89+
virtual bool isGpuHangDetected(OsContext &osContext) = 0;
9190

9291
protected:
9392
DriverModelType driverModelType;

shared/source/os_interface/windows/wddm/wddm.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -918,6 +918,13 @@ bool Wddm::waitFromCpu(uint64_t lastFenceValue, const MonitoredFence &monitoredF
918918
return status == STATUS_SUCCESS;
919919
}
920920

921+
bool Wddm::isGpuHangDetected(OsContext &osContext) {
922+
const auto osContextWin = static_cast<OsContextWin *>(&osContext);
923+
const auto &monitoredFence = osContextWin->getResidencyController().getMonitoredFence();
924+
925+
return monitoredFence.cpuAddress && *monitoredFence.cpuAddress == gpuHangIndication;
926+
}
927+
921928
void Wddm::initGfxPartition(GfxPartition &outGfxPartition, uint32_t rootDeviceIndex, size_t numRootDevices, bool useExternalFrontWindowPool) const {
922929
if (gfxPartition.SVM.Limit != 0) {
923930
outGfxPartition.heapInit(HeapIndex::HEAP_SVM, gfxPartition.SVM.Base, gfxPartition.SVM.Limit - gfxPartition.SVM.Base + 1);

shared/source/os_interface/windows/wddm/wddm.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,8 @@
2323

2424
#include "sku_info.h"
2525

26+
#include <cstdint>
27+
#include <limits>
2628
#include <memory>
2729
#include <mutex>
2830

@@ -57,6 +59,7 @@ CREATECONTEXT_PVTDATA initPrivateData(OsContextWin &osContext);
5759
class Wddm : public DriverModel {
5860
public:
5961
static constexpr DriverModelType driverModelType = DriverModelType::WDDM;
62+
static constexpr std::uint64_t gpuHangIndication{std::numeric_limits<std::uint64_t>::max()};
6063

6164
typedef HRESULT(WINAPI *CreateDXGIFactoryFcn)(REFIID riid, void **ppFactory);
6265
typedef HRESULT(WINAPI *DXCoreCreateAdapterFactoryFcn)(REFIID riid, void **ppFactory);
@@ -109,6 +112,8 @@ class Wddm : public DriverModel {
109112

110113
MOCKABLE_VIRTUAL bool isShutdownInProgress();
111114

115+
bool isGpuHangDetected(OsContext &osContext) override;
116+
112117
bool configureDeviceAddressSpace();
113118
const FeatureTable &getFeatureTable() const {
114119
return *featureTable;

0 commit comments

Comments
 (0)