Skip to content

Commit b39aafe

Browse files
feature(sysman): Add Support for ras exp API's
Related-To: NEO-8839, NEO-8873 Signed-off-by: Bellekallu Rajkiran <[email protected]>
1 parent c339e57 commit b39aafe

File tree

16 files changed

+1389
-40
lines changed

16 files changed

+1389
-40
lines changed

level_zero/api/sysman/zes_sysman_api_entrypoints.h

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1153,17 +1153,21 @@ ze_result_t zesRasGetStateExp(
11531153
zes_ras_handle_t hRas,
11541154
uint32_t *pCount,
11551155
zes_ras_state_exp_t *pState) {
1156-
if (L0::Sysman::sysmanOnlyInit) {
1156+
if (L0::sysmanInitFromCore) {
1157+
return L0::Ras::fromHandle(hRas)->rasGetStateExp(pCount, pState);
1158+
} else if (L0::Sysman::sysmanOnlyInit) {
11571159
return L0::Sysman::Ras::fromHandle(hRas)->rasGetStateExp(pCount, pState);
11581160
} else {
1159-
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
1161+
return ZE_RESULT_ERROR_UNINITIALIZED;
11601162
}
11611163
}
11621164

11631165
ze_result_t zesRasClearStateExp(
11641166
zes_ras_handle_t hRas,
11651167
zes_ras_error_category_exp_t category) {
1166-
if (L0::Sysman::sysmanOnlyInit) {
1168+
if (L0::sysmanInitFromCore) {
1169+
return L0::Ras::fromHandle(hRas)->rasClearStateExp(category);
1170+
} else if (L0::Sysman::sysmanOnlyInit) {
11671171
return L0::Sysman::Ras::fromHandle(hRas)->rasClearStateExp(category);
11681172
} else {
11691173
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;

level_zero/sysman/test/unit_tests/sources/ras/linux/test_zes_ras.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,7 @@ struct SysmanRasMultiDeviceFixture : public SysmanMultiDeviceFixture {
630630
}
631631
};
632632
HWTEST2_F(SysmanRasMultiDeviceFixture, GivenValidSysmanHandleWithMultiDeviceWhenRetrievingRasHandlesThenSuccessIsReturned, IsGtRasSupportedProduct) {
633+
633634
L0::Sysman::RasHandleContext *pRasHandleContext = new L0::Sysman::RasHandleContext(pSysmanDeviceImp->pOsSysman);
634635
uint32_t count = 0;
635636
ze_result_t result = pRasHandleContext->rasGet(&count, nullptr);

level_zero/tools/source/sysman/ras/linux/os_ras_imp.cpp

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include "drm/intel_hwconfig_types.h"
1717

18+
#include <algorithm>
1819
namespace L0 {
1920

2021
static bool isMemoryTypeHbm(LinuxSysmanImp *pLinuxSysmanImp) {
@@ -84,6 +85,62 @@ ze_result_t LinuxRasImp::osRasGetState(zes_ras_state_t &state, ze_bool_t clear)
8485
return result;
8586
}
8687

88+
// Gathers experimental RAS state entries from every registered RAS source into pState.
//
// pCount  - in/out. When *pCount is 0 the function only reports the total number of
//           categories exposed by all sources and returns ZE_RESULT_SUCCESS.
//           Otherwise it is the capacity (in entries) of pState.
// pState  - output array; each source writes its entries right after the entries of the
//           sources that succeeded before it.
//
// Returns ZE_RESULT_SUCCESS if at least one source delivered its state, otherwise
// ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE. Sources that fail are skipped.
ze_result_t LinuxRasImp::osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) {
    ze_result_t result = ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
    uint32_t totalCategoryCount = 0;
    // Per-source category counts (NOT a running total): entry i is the number of
    // categories owned by rasSources[i].
    std::vector<uint32_t> numCategoriesBySources = {};
    numCategoriesBySources.reserve(rasSources.size());
    for (auto &rasSource : rasSources) {
        const uint32_t sourceCategoryCount = rasSource->osRasGetCategoryCount();
        totalCategoryCount += sourceCategoryCount;
        numCategoriesBySources.push_back(sourceCategoryCount);
    }

    if (*pCount == 0) {
        *pCount = totalCategoryCount;
        return ZE_RESULT_SUCCESS;
    }

    // Never hand out more entries than the caller's buffer holds or than exist overall.
    uint32_t remainingCategories = std::min(totalCategoryCount, *pCount);
    uint32_t numCategoriesAssigned = 0u;
    for (uint32_t rasSourceIdx = 0u; rasSourceIdx < rasSources.size(); rasSourceIdx++) {
        auto &rasSource = rasSources[rasSourceIdx];
        const uint32_t numCategoriesRequested = std::min(remainingCategories, numCategoriesBySources[rasSourceIdx]);
        ze_result_t localResult = rasSource->osRasGetStateExp(numCategoriesRequested, &pState[numCategoriesAssigned]);
        if (localResult != ZE_RESULT_SUCCESS) {
            // A failing source consumes neither budget nor output slots.
            continue;
        }
        remainingCategories -= numCategoriesRequested;
        // Advance by what was actually requested from this source so the next source
        // writes directly behind it and never past the caller's buffer.
        numCategoriesAssigned += numCategoriesRequested;
        result = localResult;
        if (remainingCategories == 0u) {
            break;
        }
    }
    return result;
}
120+
121+
ze_result_t LinuxRasImp::osRasClearStateExp(zes_ras_error_category_exp_t category) {
122+
if (pFsAccess->isRootUser() == false) {
123+
NEO::printDebugString(NEO::debugManager.flags.PrintDebugMessages.get(), stderr, "Error@ %s(): Insufficient permissions and returning error:0x%x \n", __FUNCTION__, ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS);
124+
return ZE_RESULT_ERROR_INSUFFICIENT_PERMISSIONS;
125+
}
126+
127+
if (ZES_RAS_ERROR_CATEGORY_EXP_L3FABRIC_ERRORS < category) {
128+
return ZE_RESULT_ERROR_INVALID_ENUMERATION;
129+
}
130+
131+
ze_result_t result = ZE_RESULT_ERROR_NOT_AVAILABLE;
132+
for (auto &rasSource : rasSources) {
133+
result = rasSource->osRasClearStateExp(category);
134+
if (result != ZE_RESULT_SUCCESS) {
135+
if (result == ZE_RESULT_ERROR_NOT_AVAILABLE) {
136+
continue;
137+
}
138+
return result;
139+
}
140+
}
141+
return result;
142+
}
143+
87144
void LinuxRasImp::initSources() {
88145
rasSources.push_back(std::make_unique<L0::LinuxRasSourceGt>(pLinuxSysmanImp, osRasErrorType, isSubdevice, subdeviceId));
89146
if (isMemoryTypeHbm(pLinuxSysmanImp) == true) {

level_zero/tools/source/sysman/ras/linux/os_ras_imp.h

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@ class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
3131
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
3232
ze_result_t osRasGetConfig(zes_ras_config_t *config) override;
3333
ze_result_t osRasSetConfig(const zes_ras_config_t *config) override;
34+
ze_result_t osRasGetStateExp(uint32_t *pCount, zes_ras_state_exp_t *pState) override;
35+
ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) override;
3436
LinuxRasImp(OsSysman *pOsSysman, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
3537
LinuxRasImp() = default;
3638
~LinuxRasImp() override = default;
@@ -52,13 +54,19 @@ class LinuxRasImp : public OsRas, NEO::NonCopyableOrMovableClass {
5254
class LinuxRasSources : NEO::NonCopyableOrMovableClass {
5355
public:
5456
virtual ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) = 0;
57+
virtual ze_result_t osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) = 0;
58+
virtual uint32_t osRasGetCategoryCount() = 0;
59+
virtual ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) = 0;
5560
virtual ~LinuxRasSources() = default;
5661
};
5762

5863
class LinuxRasSourceGt : public LinuxRasSources {
5964
public:
6065
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
66+
ze_result_t osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) override;
67+
ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) override;
6168
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
69+
uint32_t osRasGetCategoryCount() override;
6270
LinuxRasSourceGt(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, ze_bool_t onSubdevice, uint32_t subdeviceId);
6371
LinuxRasSourceGt() = default;
6472
~LinuxRasSourceGt() override;
@@ -82,24 +90,31 @@ class LinuxRasSourceGt : public LinuxRasSources {
8290
const std::string &errorCounterDir,
8391
uint64_t &errorVal);
8492
void closeFds();
93+
// Returns true while the bit for `category` is not set in clearStatus.
bool getAbsoluteCount(zes_ras_error_category_exp_t category) {
    const auto categoryBit = (1 << category);
    return (clearStatus & categoryBit) == 0;
}
8596
int64_t groupFd = -1;
8697
std::vector<int64_t> memberFds = {};
8798
uint64_t initialErrorCount[maxRasErrorCategoryCount] = {0};
88-
std::map<zes_ras_error_cat_t, uint64_t> errorCategoryToEventCount;
89-
uint64_t totalEventCount = 0;
99+
uint32_t clearStatus = 0;
100+
std::map<zes_ras_error_category_exp_t, uint64_t> errorCategoryToEventCount;
90101
bool isSubdevice = false;
91102
uint32_t subdeviceId = 0;
92103
};
93104

94105
class LinuxRasSourceHbm : public LinuxRasSources {
95106
public:
96107
ze_result_t osRasGetState(zes_ras_state_t &state, ze_bool_t clear) override;
108+
ze_result_t osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) override;
109+
ze_result_t osRasClearStateExp(zes_ras_error_category_exp_t category) override;
97110
static void getSupportedRasErrorTypes(std::set<zes_ras_error_type_t> &errorType, OsSysman *pOsSysman, ze_device_handle_t deviceHandle);
111+
uint32_t osRasGetCategoryCount() override;
98112
LinuxRasSourceHbm(LinuxSysmanImp *pLinuxSysmanImp, zes_ras_error_type_t type, uint32_t subdeviceId);
99113
LinuxRasSourceHbm() = default;
100114
~LinuxRasSourceHbm() override{};
101115

102116
protected:
117+
ze_result_t getMemoryErrorCountFromFw(zes_ras_error_type_t rasErrorType, uint32_t subDeviceCount, uint64_t &errorCount);
103118
LinuxSysmanImp *pLinuxSysmanImp = nullptr;
104119
zes_ras_error_type_t osRasErrorType = {};
105120
FirmwareUtil *pFwInterface = nullptr;
@@ -108,6 +123,7 @@ class LinuxRasSourceHbm : public LinuxRasSources {
108123
private:
109124
uint64_t errorBaseline = 0;
110125
uint32_t subdeviceId = 0;
126+
uint32_t subDeviceCount = 0;
111127
};
112128

113129
} // namespace L0

level_zero/tools/source/sysman/ras/linux/os_ras_imp_gt.cpp

Lines changed: 68 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,16 @@
1212
#include "level_zero/tools/source/sysman/sysman_imp.h"
1313

1414
namespace L0 {
15-
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsUncorrectable = {
16-
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,
15+
static const std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEventsUncorrectable = {
16+
{ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS,
1717
{"fatal-array-bist", "fatal-idi-parity", "fatal-l3-double",
1818
"fatal-l3-ecc-checker",
1919
"fatal-sqidi", "fatal-tlb", "fatal-l3bank"}},
20-
{ZES_RAS_ERROR_CAT_RESET,
20+
{ZES_RAS_ERROR_CATEGORY_EXP_RESET,
2121
{"engine-reset"}},
22-
{ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS,
22+
{ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS,
2323
{"eu-attention"}},
24-
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
24+
{ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS,
2525
{"soc-fatal-psf-0", "soc-fatal-psf-1", "soc-fatal-psf-2", "soc-fatal-psf-csc-0",
2626
"soc-fatal-psf-csc-1", "soc-fatal-psf-csc-2", "soc-fatal-punit",
2727
"sgunit-fatal", "soc-nonfatal-punit", "sgunit-fatal", "sgunit-nonfatal", "gsc-nonfatal-mia-shutdown",
@@ -30,20 +30,20 @@ static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToL
3030
"gsc-nonfatal-ucode-parity", "gsc-nonfatal-mia-int", "gsc-nonfatal-wdg-timeout", "soc-fatal-mdfi-east",
3131
"soc-fatal-mdfi-south", "soc-nonfatal-mdfi-east", "soc-nonfatal-mdfi-south", "soc-fatal-mdfi-west",
3232
"soc-fatal-cd0-mdfi", "soc-nonfatal-cd0-mdfi"}},
33-
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
33+
{ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS,
3434
{"fatal-fpu", "fatal-eu-grf", "fatal-sampler", "fatal-slm",
3535
"fatal-guc", "fatal-eu-ic", "fatal-subslice", "fatal-l3-fabric"}},
36-
{ZES_RAS_ERROR_CAT_DRIVER_ERRORS,
36+
{ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS,
3737
{"driver-object-migration", "driver-engine-other", "driver-ggtt",
3838
"driver-gt-interrupt", "driver-gt-other", "driver-guc-communication",
3939
"driver-rps"}}};
4040

41-
static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEventsCorrectable = {
42-
{ZES_RAS_ERROR_CAT_CACHE_ERRORS,
41+
static const std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEventsCorrectable = {
42+
{ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS,
4343
{"correctable-l3-sng", "correctable-l3bank"}},
44-
{ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS,
44+
{ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS,
4545
{"sgunit-correctable", "gsc-correctable-sram-ecc"}},
46-
{ZES_RAS_ERROR_CAT_COMPUTE_ERRORS,
46+
{ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS,
4747
{"correctable-eu-grf", "correctable-eu-ic", "correctable-guc", "correctable-sampler", "correctable-slm", "correctable-subslice"}}};
4848

4949
static void closeFd(int64_t &fd) {
@@ -93,7 +93,7 @@ static uint64_t convertHexToUint64(std::string strVal) {
9393
return config;
9494
}
9595

96-
static bool getErrorType(std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEvents, std::vector<std::string> &eventList, ze_device_handle_t deviceHandle) {
96+
static bool getErrorType(std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEvents, std::vector<std::string> &eventList, ze_device_handle_t deviceHandle) {
9797
ze_bool_t onSubDevice = false;
9898
uint32_t subDeviceId = 0;
9999
SysmanDeviceImp::getSysmanDeviceInfo(deviceHandle, subDeviceId, onSubDevice, true);
@@ -149,7 +149,6 @@ void LinuxRasSourceGt::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t>
149149
ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t clear) {
150150
if (clear == true) {
151151
closeFds();
152-
totalEventCount = 0;
153152
memset(state.category, 0, maxRasErrorCategoryCount * sizeof(uint64_t));
154153
memset(initialErrorCount, 0, maxRasErrorCategoryCount * sizeof(uint64_t));
155154
}
@@ -160,14 +159,8 @@ ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t cl
160159
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
161160
}
162161

163-
std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToEvent;
164-
if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
165-
categoryToEvent = categoryToListOfEventsCorrectable;
166-
}
167-
if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
168-
categoryToEvent = categoryToListOfEventsUncorrectable;
169-
}
170-
std::vector<std::uint64_t> data(2 + totalEventCount, 0); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
162+
auto numEvents = memberFds.size() + 1; // Add 1 for group Fd
163+
std::vector<std::uint64_t> data(2 + numEvents, 0); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
171164
if (pPmuInterface->pmuRead(static_cast<int>(groupFd), data.data(), sizeof(uint64_t) * data.size()) < 0) {
172165
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
173166
}
@@ -186,6 +179,57 @@ ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t cl
186179
return ZE_RESULT_SUCCESS;
187180
}
188181

182+
// Fills pState with one entry per tracked error category, up to numCategoriesRequested.
// Each entry's counter is the sum of the PMU counters that belong to the category plus
// the category's stored initial error count.
// Returns ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE when the PMU group descriptor is not
// open or the PMU read fails, ZE_RESULT_SUCCESS otherwise.
ze_result_t LinuxRasSourceGt::osRasGetStateExp(uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) {
    initRasErrors(false);
    if (groupFd < 0) {
        return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
    }

    // Read buffer layout: [0] number of events, [1] timestamp, [2..] one counter per event.
    const auto eventSlots = memberFds.size() + 1; // members plus the group fd itself
    std::vector<std::uint64_t> counters(2 + eventSlots, 0);
    const auto bytesToRead = sizeof(uint64_t) * counters.size();
    if (pPmuInterface->pmuRead(static_cast<int>(groupFd), counters.data(), bytesToRead) < 0) {
        return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
    }

    // Walk the categories in map order; each category owns a contiguous run of counters.
    uint64_t cursor = 2; // first counter slot in the buffer
    uint32_t filled = 0u;
    for (const auto &categoryEntry : errorCategoryToEventCount) {
        if (filled >= numCategoriesRequested) {
            break;
        }
        const auto category = categoryEntry.first;
        const uint64_t eventsInCategory = categoryEntry.second;

        uint64_t categoryTotal = 0;
        for (uint64_t offset = 0; offset < eventsInCategory; ++offset) {
            categoryTotal += counters[cursor + offset];
        }

        pState[filled].category = category;
        pState[filled].errorCounter = categoryTotal + initialErrorCount[category];
        cursor += eventsInCategory;
        ++filled;
    }

    return ZE_RESULT_SUCCESS;
}
213+
214+
// Clears the given category if this source tracks it: closes the open PMU descriptors,
// marks the category as cleared in clearStatus and zeroes its stored initial count.
// Returns ZE_RESULT_ERROR_NOT_AVAILABLE when the category is not tracked by this source.
ze_result_t LinuxRasSourceGt::osRasClearStateExp(zes_ras_error_category_exp_t category) {
    const bool categoryTracked = errorCategoryToEventCount.count(category) != 0;
    if (!categoryTracked) {
        return ZE_RESULT_ERROR_NOT_AVAILABLE;
    }

    closeFds();
    clearStatus |= (1 << category);
    initialErrorCount[category] = 0;
    return ZE_RESULT_SUCCESS;
}
225+
226+
uint32_t LinuxRasSourceGt::osRasGetCategoryCount() {
227+
if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
228+
return static_cast<uint32_t>(categoryToListOfEventsUncorrectable.size());
229+
}
230+
return static_cast<uint32_t>(categoryToListOfEventsCorrectable.size());
231+
}
232+
189233
ze_result_t LinuxRasSourceGt::getPmuConfig(
190234
const std::string &eventDirectory,
191235
const std::vector<std::string> &listOfEvents,
@@ -220,7 +264,7 @@ void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
220264
if (result != ZE_RESULT_SUCCESS) {
221265
return;
222266
}
223-
std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToListOfEvents;
267+
std::map<zes_ras_error_category_exp_t, std::vector<std::string>> categoryToListOfEvents;
224268
if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
225269
categoryToListOfEvents = categoryToListOfEventsCorrectable;
226270
}
@@ -251,7 +295,7 @@ void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
251295
errorPrefixLocal = "error--";
252296
}
253297
uint64_t initialErrorVal = 0;
254-
if (clear == false) {
298+
if ((clear == false) && (getAbsoluteCount(rasErrorCatToListOfEvents.first) == true)) {
255299
result = getBootUpErrorCountFromSysfs(nameOfError, errorCounterDirLocal, initialErrorVal);
256300
if (result != ZE_RESULT_SUCCESS) {
257301
continue;
@@ -275,9 +319,9 @@ void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
275319
eventCount++;
276320
errorCount += initialErrorVal;
277321
}
322+
clearStatus &= ~(1 << rasErrorCatToListOfEvents.first);
278323
initialErrorCount[rasErrorCatToListOfEvents.first] = errorCount;
279324
errorCategoryToEventCount[rasErrorCatToListOfEvents.first] = eventCount;
280-
totalEventCount += eventCount;
281325
}
282326
}
283327

0 commit comments

Comments
 (0)