12
12
#include " level_zero/tools/source/sysman/sysman_imp.h"
13
13
14
14
namespace L0 {
15
- static const std::map<zes_ras_error_cat_t , std::vector<std::string>> categoryToListOfEventsUncorrectable = {
16
- {ZES_RAS_ERROR_CAT_CACHE_ERRORS ,
15
+ static const std::map<zes_ras_error_category_exp_t , std::vector<std::string>> categoryToListOfEventsUncorrectable = {
16
+ {ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS ,
17
17
{" fatal-array-bist" , " fatal-idi-parity" , " fatal-l3-double" ,
18
18
" fatal-l3-ecc-checker" ,
19
19
" fatal-sqidi" , " fatal-tlb" , " fatal-l3bank" }},
20
- {ZES_RAS_ERROR_CAT_RESET ,
20
+ {ZES_RAS_ERROR_CATEGORY_EXP_RESET ,
21
21
{" engine-reset" }},
22
- {ZES_RAS_ERROR_CAT_PROGRAMMING_ERRORS ,
22
+ {ZES_RAS_ERROR_CATEGORY_EXP_PROGRAMMING_ERRORS ,
23
23
{" eu-attention" }},
24
- {ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS ,
24
+ {ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS ,
25
25
{" soc-fatal-psf-0" , " soc-fatal-psf-1" , " soc-fatal-psf-2" , " soc-fatal-psf-csc-0" ,
26
26
" soc-fatal-psf-csc-1" , " soc-fatal-psf-csc-2" , " soc-fatal-punit" ,
27
27
" sgunit-fatal" , " soc-nonfatal-punit" , " sgunit-fatal" , " sgunit-nonfatal" , " gsc-nonfatal-mia-shutdown" ,
@@ -30,20 +30,20 @@ static const std::map<zes_ras_error_cat_t, std::vector<std::string>> categoryToL
30
30
" gsc-nonfatal-ucode-parity" , " gsc-nonfatal-mia-int" , " gsc-nonfatal-wdg-timeout" , " soc-fatal-mdfi-east" ,
31
31
" soc-fatal-mdfi-south" , " soc-nonfatal-mdfi-east" , " soc-nonfatal-mdfi-south" , " soc-fatal-mdfi-west" ,
32
32
" soc-fatal-cd0-mdfi" , " soc-nonfatal-cd0-mdfi" }},
33
- {ZES_RAS_ERROR_CAT_COMPUTE_ERRORS ,
33
+ {ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS ,
34
34
{" fatal-fpu" , " fatal-eu-grf" , " fatal-sampler" , " fatal-slm" ,
35
35
" fatal-guc" , " fatal-eu-ic" , " fatal-subslice" , " fatal-l3-fabric" }},
36
- {ZES_RAS_ERROR_CAT_DRIVER_ERRORS ,
36
+ {ZES_RAS_ERROR_CATEGORY_EXP_DRIVER_ERRORS ,
37
37
{" driver-object-migration" , " driver-engine-other" , " driver-ggtt" ,
38
38
" driver-gt-interrupt" , " driver-gt-other" , " driver-guc-communication" ,
39
39
" driver-rps" }}};
40
40
41
// Lookup table for correctable RAS errors: each key is an error category and
// each value is the list of event-name strings associated with that category.
// initRasErrors() copies this table when osRasErrorType is
// ZES_RAS_ERROR_TYPE_CORRECTABLE, and osRasGetCategoryCount() reports its size
// for every error type other than ZES_RAS_ERROR_TYPE_UNCORRECTABLE.
// NOTE(review): this span interleaves the removed (zes_ras_error_cat_t) and the
// added (zes_ras_error_category_exp_t) rows of the change; only the key type
// and enumerator names differ between them, the event lists are unchanged.
- static const std::map<zes_ras_error_cat_t , std::vector<std::string>> categoryToListOfEventsCorrectable = {
42
- {ZES_RAS_ERROR_CAT_CACHE_ERRORS ,
41
+ static const std::map<zes_ras_error_category_exp_t , std::vector<std::string>> categoryToListOfEventsCorrectable = {
42
+ {ZES_RAS_ERROR_CATEGORY_EXP_CACHE_ERRORS ,
43
43
{" correctable-l3-sng" , " correctable-l3bank" }},
44
- {ZES_RAS_ERROR_CAT_NON_COMPUTE_ERRORS ,
44
+ {ZES_RAS_ERROR_CATEGORY_EXP_NON_COMPUTE_ERRORS ,
45
45
{" sgunit-correctable" , " gsc-correctable-sram-ecc" }},
46
- {ZES_RAS_ERROR_CAT_COMPUTE_ERRORS ,
46
+ {ZES_RAS_ERROR_CATEGORY_EXP_COMPUTE_ERRORS ,
47
47
{" correctable-eu-grf" , " correctable-eu-ic" , " correctable-guc" , " correctable-sampler" , " correctable-slm" , " correctable-subslice" }}};
48
48
49
49
static void closeFd (int64_t &fd) {
@@ -93,7 +93,7 @@ static uint64_t convertHexToUint64(std::string strVal) {
93
93
return config;
94
94
}
95
95
96
- static bool getErrorType (std::map<zes_ras_error_cat_t , std::vector<std::string>> categoryToListOfEvents, std::vector<std::string> &eventList, ze_device_handle_t deviceHandle) {
96
+ static bool getErrorType (std::map<zes_ras_error_category_exp_t , std::vector<std::string>> categoryToListOfEvents, std::vector<std::string> &eventList, ze_device_handle_t deviceHandle) {
97
97
ze_bool_t onSubDevice = false ;
98
98
uint32_t subDeviceId = 0 ;
99
99
SysmanDeviceImp::getSysmanDeviceInfo (deviceHandle, subDeviceId, onSubDevice, true );
@@ -149,7 +149,6 @@ void LinuxRasSourceGt::getSupportedRasErrorTypes(std::set<zes_ras_error_type_t>
149
149
ze_result_t LinuxRasSourceGt::osRasGetState (zes_ras_state_t &state, ze_bool_t clear) {
150
150
if (clear == true ) {
151
151
closeFds ();
152
- totalEventCount = 0 ;
153
152
memset (state.category , 0 , maxRasErrorCategoryCount * sizeof (uint64_t ));
154
153
memset (initialErrorCount, 0 , maxRasErrorCategoryCount * sizeof (uint64_t ));
155
154
}
@@ -160,14 +159,8 @@ ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t cl
160
159
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
161
160
}
162
161
163
- std::map<zes_ras_error_cat_t , std::vector<std::string>> categoryToEvent;
164
- if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
165
- categoryToEvent = categoryToListOfEventsCorrectable;
166
- }
167
- if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
168
- categoryToEvent = categoryToListOfEventsUncorrectable;
169
- }
170
- std::vector<std::uint64_t > data (2 + totalEventCount, 0 ); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
162
+ auto numEvents = memberFds.size () + 1 ; // Add 1 for group Fd
163
+ std::vector<std::uint64_t > data (2 + numEvents, 0 ); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
171
164
if (pPmuInterface->pmuRead (static_cast <int >(groupFd), data.data (), sizeof (uint64_t ) * data.size ()) < 0 ) {
172
165
return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE;
173
166
}
@@ -186,6 +179,57 @@ ze_result_t LinuxRasSourceGt::osRasGetState(zes_ras_state_t &state, ze_bool_t cl
186
179
return ZE_RESULT_SUCCESS;
187
180
}
188
181
182
+ ze_result_t LinuxRasSourceGt::osRasGetStateExp (uint32_t numCategoriesRequested, zes_ras_state_exp_t *pState) {
183
+ initRasErrors (false );
184
+ // Iterate over all the file descriptor values present in vector which is mapped to given ras error category
185
+ // Use the file descriptors to read pmu counters and add all the errors corresponding to the ras error category
186
+ if (groupFd < 0 ) {
187
+ return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
188
+ }
189
+
190
+ auto numEvents = memberFds.size () + 1 ; // Add 1 for group Fd
191
+ std::vector<std::uint64_t > data (2 + numEvents, 0 ); // In data[], event count starts from second index, first value gives number of events and second value is for timestamp
192
+ if (pPmuInterface->pmuRead (static_cast <int >(groupFd), data.data (), sizeof (uint64_t ) * data.size ()) < 0 ) {
193
+ return ZE_RESULT_ERROR_DEPENDENCY_UNAVAILABLE;
194
+ }
195
+
196
+ /* The data buffer retrieved after reading pmu counters is parsed to get the error count for each suberror category */
197
+ uint64_t initialIndex = 2 ; // Initial index in the buffer from which the data be parsed begins
198
+ uint32_t categoryIdx = 0u ;
199
+ for (auto errorCat = errorCategoryToEventCount.begin (); (errorCat != errorCategoryToEventCount.end ()) && (categoryIdx < numCategoriesRequested); errorCat++) {
200
+ uint64_t errorCount = 0 ;
201
+ uint64_t j = 0 ;
202
+ for (; j < errorCat->second ; j++) {
203
+ errorCount += data[initialIndex + j];
204
+ }
205
+ pState[categoryIdx].category = errorCat->first ;
206
+ pState[categoryIdx].errorCounter = errorCount + initialErrorCount[errorCat->first ];
207
+ initialIndex += j;
208
+ categoryIdx++;
209
+ }
210
+
211
+ return ZE_RESULT_SUCCESS;
212
+ }
213
+
214
+ ze_result_t LinuxRasSourceGt::osRasClearStateExp (zes_ras_error_category_exp_t category) {
215
+ ze_result_t result = ZE_RESULT_ERROR_NOT_AVAILABLE;
216
+ // check requested category is already initialized
217
+ if (errorCategoryToEventCount.find (category) != errorCategoryToEventCount.end ()) {
218
+ closeFds ();
219
+ clearStatus |= (1 << category);
220
+ initialErrorCount[category] = 0 ;
221
+ result = ZE_RESULT_SUCCESS;
222
+ }
223
+ return result;
224
+ }
225
+
226
+ uint32_t LinuxRasSourceGt::osRasGetCategoryCount () {
227
+ if (osRasErrorType == ZES_RAS_ERROR_TYPE_UNCORRECTABLE) {
228
+ return static_cast <uint32_t >(categoryToListOfEventsUncorrectable.size ());
229
+ }
230
+ return static_cast <uint32_t >(categoryToListOfEventsCorrectable.size ());
231
+ }
232
+
189
233
ze_result_t LinuxRasSourceGt::getPmuConfig (
190
234
const std::string &eventDirectory,
191
235
const std::vector<std::string> &listOfEvents,
@@ -220,7 +264,7 @@ void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
220
264
if (result != ZE_RESULT_SUCCESS) {
221
265
return ;
222
266
}
223
- std::map<zes_ras_error_cat_t , std::vector<std::string>> categoryToListOfEvents;
267
+ std::map<zes_ras_error_category_exp_t , std::vector<std::string>> categoryToListOfEvents;
224
268
if (osRasErrorType == ZES_RAS_ERROR_TYPE_CORRECTABLE) {
225
269
categoryToListOfEvents = categoryToListOfEventsCorrectable;
226
270
}
@@ -251,7 +295,7 @@ void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
251
295
errorPrefixLocal = " error--" ;
252
296
}
253
297
uint64_t initialErrorVal = 0 ;
254
- if (clear == false ) {
298
+ if (( clear == false ) && ( getAbsoluteCount (rasErrorCatToListOfEvents. first ) == true ) ) {
255
299
result = getBootUpErrorCountFromSysfs (nameOfError, errorCounterDirLocal, initialErrorVal);
256
300
if (result != ZE_RESULT_SUCCESS) {
257
301
continue ;
@@ -275,9 +319,9 @@ void LinuxRasSourceGt::initRasErrors(ze_bool_t clear) {
275
319
eventCount++;
276
320
errorCount += initialErrorVal;
277
321
}
322
+ clearStatus &= ~(1 << rasErrorCatToListOfEvents.first );
278
323
initialErrorCount[rasErrorCatToListOfEvents.first ] = errorCount;
279
324
errorCategoryToEventCount[rasErrorCatToListOfEvents.first ] = eventCount;
280
- totalEventCount += eventCount;
281
325
}
282
326
}
283
327
0 commit comments