Skip to content

Commit 2c7cd28

Browse files
YiPeng Chai authored and alexdeucher committed
drm/amdgpu: gpu recovers from fatal error in poison mode
Fatal error occurs in ras poison mode, mode1 reset is used to recover gpu.

Signed-off-by: YiPeng Chai <[email protected]>
Reviewed-by: Hawking Zhang <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent 50a7c87 commit 2c7cd28

File tree

2 files changed

+12
-0
lines changed

2 files changed

+12
-0
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2065,6 +2065,14 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
20652065
ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
20662066
reset_context.method = AMD_RESET_METHOD_MODE2;
20672067
}
2068+
2069+
/* Fatal error occurs in poison mode, mode1 reset is used to
2070+
* recover gpu.
2071+
*/
2072+
if (ras->gpu_reset_flags & AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
2073+
ras->gpu_reset_flags &= ~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
2074+
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
2075+
}
20682076
}
20692077

20702078
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
@@ -2955,9 +2963,12 @@ void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev)
29552963
return;
29562964

29572965
if (atomic_cmpxchg(&amdgpu_ras_in_intr, 0, 1) == 0) {
2966+
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
2967+
29582968
dev_info(adev->dev, "uncorrectable hardware error"
29592969
"(ERREVENT_ATHUB_INTERRUPT) detected!\n");
29602970

2971+
ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE1_RESET;
29612972
amdgpu_ras_reset_gpu(adev);
29622973
}
29632974
}

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -340,6 +340,7 @@ enum amdgpu_ras_ret {
340340
#define AMDGPU_RAS_ERR_ADDRESS_VALID (1 << 2)
341341

342342
#define AMDGPU_RAS_GPU_RESET_MODE2_RESET (0x1 << 0)
343+
#define AMDGPU_RAS_GPU_RESET_MODE1_RESET (0x1 << 1)
343344

344345
struct amdgpu_ras_err_status_reg_entry {
345346
uint32_t hwip;

0 commit comments

Comments
 (0)