Skip to content

Commit 43c4d57

Browse files
John Clements authored and alexdeucher committed
drm/amdgpu: protect RAS sysfs during GPU reset
MMHub EDC becomes dirty after BACO reset. EDC registers should be cleared early on in the reset phase.
Reviewed-by: Hawking Zhang <[email protected]>
Signed-off-by: John Clements <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent cb7adfd commit 43c4d57

File tree

2 files changed

+20
-1
lines changed

2 files changed

+20
-1
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2742,6 +2742,9 @@ static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
27422742

27432743
if (adev->asic_reset_res)
27442744
goto fail;
2745+
2746+
if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2747+
adev->mmhub.funcs->reset_ras_error_count(adev);
27452748
} else {
27462749

27472750
task_barrier_full(&hive->tb);
@@ -3910,8 +3913,15 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
39103913
}
39113914
}
39123915

3913-
if (!r && amdgpu_ras_intr_triggered())
3916+
if (!r && amdgpu_ras_intr_triggered()) {
3917+
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
3918+
if (tmp_adev->mmhub.funcs &&
3919+
tmp_adev->mmhub.funcs->reset_ras_error_count)
3920+
tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
3921+
}
3922+
39143923
amdgpu_ras_intr_cleared();
3924+
}
39153925

39163926
list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
39173927
if (need_full_reset) {

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -281,6 +281,11 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
281281
struct ras_debug_if data;
282282
int ret = 0;
283283

284+
if (amdgpu_ras_intr_triggered()) {
285+
DRM_WARN("RAS WARN: error injection currently inaccessible\n");
286+
return size;
287+
}
288+
284289
ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
285290
if (ret)
286291
return -EINVAL;
@@ -394,6 +399,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
394399
.head = obj->head,
395400
};
396401

402+
if (amdgpu_ras_intr_triggered())
403+
return snprintf(buf, PAGE_SIZE,
404+
"Query currently inaccessible\n");
405+
397406
if (amdgpu_ras_error_query(obj->adev, &info))
398407
return -EINVAL;
399408

0 commit comments

Comments
 (0)