#
# Copyright (C) 2023 Intel Corporation
#
# SPDX-License-Identifier: MIT
#
# See YaML.md for syntax definition
#
--- # --------------------------------------------------------------------------
# File header: describes the extension this spec file covers and the API
# version it is tagged with.
type: header
desc: "Intel $OneApi Level-Zero Sysman Extension APIs for RAS Get State and Clear State"
version: "1.7"
--- # --------------------------------------------------------------------------
# Macro holding the extension's name string. The value is single-quoted so the
# inner double quotes are kept as part of the value.
type: macro
desc: "RAS Get State Extension Name"
version: "1.7"
name: $S_RAS_GET_STATE_EXP_NAME
value: '"$XS_extension_ras_state"'
--- # --------------------------------------------------------------------------
# Enumeration of the versions of this extension.
type: enum
desc: "RAS Get State Extension Version(s)"
version: "1.7"
name: $s_ras_state_exp_version_t
etors:
  # The name is quoted so it is kept as a string; unquoted, some YAML parsers
  # read 1_0 as the integer 10.
  - name: "1_0"
    value: "$X_MAKE_VERSION( 1, 0 )"
    desc: "version 1.0"
--- # --------------------------------------------------------------------------
# Enumeration of the RAS error categories for which a counter can be reported
# or cleared. Only RESET carries an explicit value; the remaining enumerators
# have no `value` key.
type: enum
desc: "RAS error categories"
version: "1.7"
class: $sRas
name: $s_ras_error_category_exp_t
etors:
  - name: RESET
    value: "0"
    desc: "The number of accelerator engine resets attempted by the driver"
    version: "1.7"
  - name: PROGRAMMING_ERRORS
    desc: "The number of hardware exceptions generated by the way workloads have programmed the hardware"
    version: "1.7"
  - name: DRIVER_ERRORS
    desc: "The number of low level driver communication errors that have occurred"
    version: "1.7"
  - name: COMPUTE_ERRORS
    desc: "The number of errors that have occurred in the compute accelerator hardware"
    version: "1.7"
  - name: NON_COMPUTE_ERRORS
    desc: "The number of errors that have occurred in the fixed-function accelerator hardware"
    version: "1.7"
  - name: CACHE_ERRORS
    desc: "The number of errors that have occurred in caches (L1/L3/register file/shared local memory/sampler)"
    version: "1.7"
  - name: DISPLAY_ERRORS
    desc: "The number of errors that have occurred in the display"
    version: "1.7"
  - name: MEMORY_ERRORS
    desc: "The number of errors that have occurred in Memory"
    version: "1.7"
  - name: SCALE_ERRORS
    desc: "The number of errors that have occurred in Scale Fabric"
    version: "1.7"
  - name: L3FABRIC_ERRORS
    desc: "The number of errors that have occurred in L3 Fabric"
    version: "1.7"
--- # --------------------------------------------------------------------------
# Structure pairing one RAS error category with its current counter value.
type: struct
desc: "Extension structure for providing RAS error counters for different error sets"
version: "1.7"
class: $sRas
name: $s_ras_state_exp_t
members:
  - type: $s_ras_error_category_exp_t
    name: category
    desc: "[out] category for which error counter is provided."
  - type: uint64_t
    name: errorCounter
    desc: "[out] Current value of RAS counter for specific error category."
--- # --------------------------------------------------------------------------
# Query function: retrieves the error counters for the RAS error categories
# into a caller-provided array, following the count/array contract described
# in the pCount and pState descriptions below.
type: function
desc: "Ras Get State"
# Tagged 1.7 to match the types this function uses, which are all declared
# with version 1.7 in this file.
version: "1.7"
class: $sRas
name: GetStateExp
details:
  - "This function retrieves error counters for different RAS error categories."
  - "The application may call this function from simultaneous threads."
  - "The implementation of this function should be lock-free."
params:
  - type: $s_ras_handle_t
    name: hRas
    desc: "[in] Handle for the component."
  - type: uint32_t*
    name: pCount
    desc: |
      [in,out] pointer to the number of RAS state structures that can be retrieved.
      if count is zero, then the driver shall update the value with the total number of error categories for which state can be retrieved.
      if count is greater than the number of RAS states available, then the driver shall update the value with the correct number of RAS states available.
  - type: $s_ras_state_exp_t*
    name: pState
    desc: |
      [in,out][optional][range(0, *pCount)] array of query results for RAS error states for different categories.
      if count is less than the number of RAS states available, then driver shall only retrieve that number of RAS states.
--- # --------------------------------------------------------------------------
# Clear function: resets the error counter of a single RAS error category.
type: function
desc: "Ras Clear State"
# Tagged 1.7 to match the category enum this function takes, which is declared
# with version 1.7 in this file.
version: "1.7"
class: $sRas
name: ClearStateExp
details:
  - "This function clears error counters for a RAS error category."
  - "Clearing errors will affect other threads/applications - the counter values will start from zero."
  - "Clearing errors requires write permissions."
  - "The application should not call this function from simultaneous threads."
  - "The implementation of this function should be lock-free."
params:
  - type: $s_ras_handle_t
    name: hRas
    desc: "[in] Handle for the component."
  - type: $s_ras_error_category_exp_t
    name: category
    desc: "[in] category for which error counter is to be cleared."
returns:
  # Each entry maps a result code to the list of conditions that produce it.
  - $X_RESULT_ERROR_INSUFFICIENT_PERMISSIONS:
      - "Don't have permissions to clear error counters."
0 commit comments