Skip to content

Commit ceb47e0

Browse files
mukjoshi authored and alexdeucher committed
drm/amdgpu: Fix SDMA RAS error reporting on Aldebaran
Fix the following issues with SDMA RAS error reporting: 1. Read the EDC_COUNTER2 register also to fetch error counts for all sub-blocks in SDMA. 2. SDMA RAS on Aldebaran supports single-bit uncorrectable errors only. So, report error count in UE count instead of CE count. Signed-off-by: Mukul Joshi <[email protected]> Reviewed-By: John Clements <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 1f0d8e3 commit ceb47e0

File tree

1 file changed

+28
-7
lines changed

1 file changed

+28
-7
lines changed

drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,7 @@ static const struct soc15_ras_field_entry sdma_v4_4_ras_fields[] = {
160160
};
161161

162162
static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev,
163+
uint32_t reg_offset,
163164
uint32_t value,
164165
uint32_t instance,
165166
uint32_t *sec_count)
@@ -169,6 +170,9 @@ static void sdma_v4_4_get_ras_error_count(struct amdgpu_device *adev,
169170

170171
/* double bits error (multiple bits) error detection is not supported */
171172
for (i = 0; i < ARRAY_SIZE(sdma_v4_4_ras_fields); i++) {
173+
if (sdma_v4_4_ras_fields[i].reg_offset != reg_offset)
174+
continue;
175+
172176
/* the SDMA_EDC_COUNTER register in each sdma instance
173177
* shares the same sed shift_mask
174178
* */
@@ -197,13 +201,30 @@ static int sdma_v4_4_query_ras_error_count(struct amdgpu_device *adev,
197201
reg_value = RREG32(reg_offset);
198202
/* double bit error is not supported */
199203
if (reg_value)
200-
sdma_v4_4_get_ras_error_count(adev, reg_value, instance, &sec_count);
201-
/* err_data->ce_count should be initialized to 0
202-
* before calling into this function */
203-
err_data->ce_count += sec_count;
204-
/* double bit error is not supported
205-
* set ue count to 0 */
206-
err_data->ue_count = 0;
204+
sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER, reg_value,
205+
instance, &sec_count);
206+
207+
reg_offset = sdma_v4_4_get_reg_offset(adev, instance, regSDMA0_EDC_COUNTER2);
208+
reg_value = RREG32(reg_offset);
209+
/* double bit error is not supported */
210+
if (reg_value)
211+
sdma_v4_4_get_ras_error_count(adev, regSDMA0_EDC_COUNTER2, reg_value,
212+
instance, &sec_count);
213+
214+
/*
215+
* err_data->ue_count should be initialized to 0
216+
* before calling into this function
217+
*
218+
* SDMA RAS supports single bit uncorrectable error detection.
219+
* So, increment uncorrectable error count.
220+
*/
221+
err_data->ue_count += sec_count;
222+
223+
/*
224+
* SDMA RAS does not support correctable errors.
225+
* Set ce count to 0.
226+
*/
227+
err_data->ce_count = 0;
207228

208229
return 0;
209230
};

0 commit comments

Comments
 (0)