Skip to content

Commit 785c536

Browse files
Lijo Lazar authored and alexdeucher committed
drm/amdgpu: Release reset locks during failures
Make sure to release reset domain lock in case of failures.

Signed-off-by: Lijo Lazar <[email protected]>
Signed-off-by: Ce Sun <[email protected]>
Reviewed-by: Hawking Zhang <[email protected]>
Fixes: 11bb337 ("drm/amdgpu: refactor amdgpu_device_gpu_recover")
Signed-off-by: Alex Deucher <[email protected]>
(cherry picked from commit 1ab11a8)
1 parent: b669507 · commit: 785c536

File tree

1 file changed

+55
-25
lines changed

1 file changed

+55
-25
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 55 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -6019,16 +6019,12 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
60196019
return ret;
60206020
}
60216021

6022-
static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
6023-
struct amdgpu_job *job,
6024-
struct amdgpu_reset_context *reset_context,
6025-
struct list_head *device_list,
6026-
struct amdgpu_hive_info *hive,
6027-
bool need_emergency_restart)
6022+
static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
6023+
struct list_head *device_list,
6024+
struct amdgpu_hive_info *hive)
60286025
{
6029-
struct list_head *device_list_handle = NULL;
60306026
struct amdgpu_device *tmp_adev = NULL;
6031-
int i, r = 0;
6027+
int r;
60326028

60336029
/*
60346030
* Build list of devices to reset.
@@ -6045,26 +6041,54 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
60456041
}
60466042
if (!list_is_first(&adev->reset_list, device_list))
60476043
list_rotate_to_front(&adev->reset_list, device_list);
6048-
device_list_handle = device_list;
60496044
} else {
60506045
list_add_tail(&adev->reset_list, device_list);
6051-
device_list_handle = device_list;
60526046
}
60536047

60546048
if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
6055-
r = amdgpu_device_health_check(device_list_handle);
6049+
r = amdgpu_device_health_check(device_list);
60566050
if (r)
60576051
return r;
60586052
}
60596053

6060-
/* We need to lock reset domain only once both for XGMI and single device */
6061-
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6062-
reset_list);
6054+
return 0;
6055+
}
6056+
6057+
static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
6058+
struct list_head *device_list)
6059+
{
6060+
struct amdgpu_device *tmp_adev = NULL;
6061+
6062+
if (list_empty(device_list))
6063+
return;
6064+
tmp_adev =
6065+
list_first_entry(device_list, struct amdgpu_device, reset_list);
60636066
amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
6067+
}
60646068

6065-
/* block all schedulers and reset given job's ring */
6066-
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6069+
static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
6070+
struct list_head *device_list)
6071+
{
6072+
struct amdgpu_device *tmp_adev = NULL;
60676073

6074+
if (list_empty(device_list))
6075+
return;
6076+
tmp_adev =
6077+
list_first_entry(device_list, struct amdgpu_device, reset_list);
6078+
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6079+
}
6080+
6081+
static int amdgpu_device_halt_activities(
6082+
struct amdgpu_device *adev, struct amdgpu_job *job,
6083+
struct amdgpu_reset_context *reset_context,
6084+
struct list_head *device_list, struct amdgpu_hive_info *hive,
6085+
bool need_emergency_restart)
6086+
{
6087+
struct amdgpu_device *tmp_adev = NULL;
6088+
int i, r = 0;
6089+
6090+
/* block all schedulers and reset given job's ring */
6091+
list_for_each_entry(tmp_adev, device_list, reset_list) {
60686092
amdgpu_device_set_mp1_state(tmp_adev);
60696093

60706094
/*
@@ -6252,11 +6276,6 @@ static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
62526276
amdgpu_ras_set_error_query_ready(tmp_adev, true);
62536277

62546278
}
6255-
6256-
tmp_adev = list_first_entry(device_list, struct amdgpu_device,
6257-
reset_list);
6258-
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6259-
62606279
}
62616280

62626281

@@ -6324,10 +6343,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
63246343
reset_context->hive = hive;
63256344
INIT_LIST_HEAD(&device_list);
63266345

6346+
if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
6347+
goto end_reset;
6348+
6349+
/* We need to lock reset domain only once both for XGMI and single device */
6350+
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
6351+
63276352
r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
63286353
hive, need_emergency_restart);
63296354
if (r)
6330-
goto end_reset;
6355+
goto reset_unlock;
63316356

63326357
if (need_emergency_restart)
63336358
goto skip_sched_resume;
@@ -6345,13 +6370,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
63456370

63466371
r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
63476372
if (r)
6348-
goto end_reset;
6373+
goto reset_unlock;
63496374
skip_hw_reset:
63506375
r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
63516376
if (r)
6352-
goto end_reset;
6377+
goto reset_unlock;
63536378
skip_sched_resume:
63546379
amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
6380+
reset_unlock:
6381+
amdgpu_device_recovery_put_reset_lock(adev, &device_list);
63556382
end_reset:
63566383
if (hive) {
63576384
mutex_unlock(&hive->hive_lock);
@@ -6763,6 +6790,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
67636790
memset(&reset_context, 0, sizeof(reset_context));
67646791
INIT_LIST_HEAD(&device_list);
67656792

6793+
amdgpu_device_recovery_prepare(adev, &device_list, hive);
6794+
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
67666795
r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
67676796
hive, false);
67686797
if (hive) {
@@ -6880,8 +6909,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
68806909
if (hive) {
68816910
list_for_each_entry(tmp_adev, &device_list, reset_list)
68826911
amdgpu_device_unset_mp1_state(tmp_adev);
6883-
amdgpu_device_unlock_reset_domain(adev->reset_domain);
68846912
}
6913+
amdgpu_device_recovery_put_reset_lock(adev, &device_list);
68856914
}
68866915

68876916
if (hive) {
@@ -6927,6 +6956,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
69276956

69286957
amdgpu_device_sched_resume(&device_list, NULL, NULL);
69296958
amdgpu_device_gpu_resume(adev, &device_list, false);
6959+
amdgpu_device_recovery_put_reset_lock(adev, &device_list);
69306960
adev->pcie_reset_ctx.occurs_dpc = false;
69316961

69326962
if (hive) {

0 commit comments

Comments
 (0)