Skip to content

Commit 5e984b0

Browse files
Hawking Zhang authored and
alexdeucher committed
drm/amdgpu: Use driver mode reset for data poison
mode-2 reset is the only reliable method that can get GC/SDMA back when poison is consumed. mmhub requires mode-1 reset.

Signed-off-by: Hawking Zhang <[email protected]>
Reviewed-by: Tao Zhou <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent 5adcd78 commit 5e984b0

File tree

1 file changed

+8
-19
lines changed

1 file changed

+8
-19
lines changed

drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c

Lines changed: 8 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
144144
uint16_t pasid, uint16_t client_id)
145145
{
146146
enum amdgpu_ras_block block = 0;
147-
int old_poison, ret = -EINVAL;
147+
int old_poison;
148148
uint32_t reset = 0;
149149
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
150150

@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
163163
case SOC15_IH_CLIENTID_SE2SH:
164164
case SOC15_IH_CLIENTID_SE3SH:
165165
case SOC15_IH_CLIENTID_UTCL2:
166-
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
167166
block = AMDGPU_RAS_BLOCK__GFX;
168-
if (ret)
169-
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
167+
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
170168
break;
171169
case SOC15_IH_CLIENTID_VMC:
172170
case SOC15_IH_CLIENTID_VMC1:
173-
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
174171
block = AMDGPU_RAS_BLOCK__MMHUB;
175-
if (ret)
176-
reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
172+
reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
177173
break;
178174
case SOC15_IH_CLIENTID_SDMA0:
179175
case SOC15_IH_CLIENTID_SDMA1:
@@ -184,22 +180,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
184180
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
185181
break;
186182
default:
187-
break;
183+
dev_warn(dev->adev->dev,
184+
"client %d does not support poison consumption\n", client_id);
185+
return;
188186
}
189187

190188
kfd_signal_poison_consumed_event(dev, pasid);
191189

192-
/* resetting queue passes, do page retirement without gpu reset
193-
* resetting queue fails, fallback to gpu reset solution
194-
*/
195-
if (!ret)
196-
dev_warn(dev->adev->dev,
197-
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
198-
client_id);
199-
else
200-
dev_warn(dev->adev->dev,
201-
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
202-
client_id);
190+
dev_warn(dev->adev->dev,
191+
"poison is consumed by client %d, kick off gpu reset flow\n", client_id);
203192

204193
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
205194
}

0 commit comments

Comments
 (0)