@@ -6019,16 +6019,12 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
 	return ret;
 }
 
-static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
-					 struct amdgpu_job *job,
-					 struct amdgpu_reset_context *reset_context,
-					 struct list_head *device_list,
-					 struct amdgpu_hive_info *hive,
-					 bool need_emergency_restart)
+static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
+					  struct list_head *device_list,
+					  struct amdgpu_hive_info *hive)
 {
-	struct list_head *device_list_handle = NULL;
 	struct amdgpu_device *tmp_adev = NULL;
-	int i, r = 0;
+	int r;
 
 	/*
 	 * Build list of devices to reset.
@@ -6045,26 +6041,54 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
 		}
 		if (!list_is_first(&adev->reset_list, device_list))
 			list_rotate_to_front(&adev->reset_list, device_list);
-		device_list_handle = device_list;
 	} else {
 		list_add_tail(&adev->reset_list, device_list);
-		device_list_handle = device_list;
 	}
 
 	if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
-		r = amdgpu_device_health_check(device_list_handle);
+		r = amdgpu_device_health_check(device_list);
 		if (r)
 			return r;
 	}
 
-	/* We need to lock reset domain only once both for XGMI and single device */
-	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
-				    reset_list);
+	return 0;
+}
+
+static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
+						  struct list_head *device_list)
+{
+	struct amdgpu_device *tmp_adev = NULL;
+
+	if (list_empty(device_list))
+		return;
+	tmp_adev =
+		list_first_entry(device_list, struct amdgpu_device, reset_list);
 	amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+}
 
-	/* block all schedulers and reset given job's ring */
-	list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
+static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
+						  struct list_head *device_list)
+{
+	struct amdgpu_device *tmp_adev = NULL;
 
+	if (list_empty(device_list))
+		return;
+	tmp_adev =
+		list_first_entry(device_list, struct amdgpu_device, reset_list);
+	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
+}
+
+static int amdgpu_device_halt_activities(
+	struct amdgpu_device *adev, struct amdgpu_job *job,
+	struct amdgpu_reset_context *reset_context,
+	struct list_head *device_list, struct amdgpu_hive_info *hive,
+	bool need_emergency_restart)
+{
+	struct amdgpu_device *tmp_adev = NULL;
+	int i, r = 0;
+
+	/* block all schedulers and reset given job's ring */
+	list_for_each_entry(tmp_adev, device_list, reset_list) {
 		amdgpu_device_set_mp1_state(tmp_adev);
 
 		/*
@@ -6252,11 +6276,6 @@ static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
 			amdgpu_ras_set_error_query_ready(tmp_adev, true);
 
 	}
-
-	tmp_adev = list_first_entry(device_list, struct amdgpu_device,
-				    reset_list);
-	amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
-
 }
 
 
@@ -6324,10 +6343,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	reset_context->hive = hive;
 	INIT_LIST_HEAD(&device_list);
 
+	if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
+		goto end_reset;
+
+	/* We need to lock reset domain only once both for XGMI and single device */
+	amdgpu_device_recovery_get_reset_lock(adev, &device_list);
+
 	r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
 					  hive, need_emergency_restart);
 	if (r)
-		goto end_reset;
+		goto reset_unlock;
 
 	if (need_emergency_restart)
 		goto skip_sched_resume;
@@ -6345,13 +6370,15 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 	r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
 	if (r)
-		goto end_reset;
+		goto reset_unlock;
 skip_hw_reset:
 	r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
 	if (r)
-		goto end_reset;
+		goto reset_unlock;
 skip_sched_resume:
 	amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
+reset_unlock:
+	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
 end_reset:
 	if (hive) {
 		mutex_unlock(&hive->hive_lock);
@@ -6763,6 +6790,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
 	memset(&reset_context, 0, sizeof(reset_context));
 	INIT_LIST_HEAD(&device_list);
 
+	amdgpu_device_recovery_prepare(adev, &device_list, hive);
+	amdgpu_device_recovery_get_reset_lock(adev, &device_list);
 	r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
 					  hive, false);
 	if (hive) {
@@ -6880,8 +6909,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
 		if (hive) {
 			list_for_each_entry(tmp_adev, &device_list, reset_list)
 				amdgpu_device_unset_mp1_state(tmp_adev);
-			amdgpu_device_unlock_reset_domain(adev->reset_domain);
 		}
+		amdgpu_device_recovery_put_reset_lock(adev, &device_list);
 	}
 
 	if (hive) {
@@ -6927,6 +6956,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
 
 	amdgpu_device_sched_resume(&device_list, NULL, NULL);
 	amdgpu_device_gpu_resume(adev, &device_list, false);
+	amdgpu_device_recovery_put_reset_lock(adev, &device_list);
 	adev->pcie_reset_ctx.occurs_dpc = false;
 
 	if (hive) {
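
Taken together, the hunks split what amdgpu_device_halt_activities() used to do into three steps: amdgpu_device_recovery_prepare() builds and health-checks the device list, amdgpu_device_recovery_get_reset_lock() takes the reset-domain lock exactly once through the first list entry (one lock covers a whole XGMI hive as well as a single device, per the comment carried over from the old code), and amdgpu_device_recovery_put_reset_lock() releases it on every exit path. The following is a standalone, compilable C sketch of that locking discipline, not kernel code: the list helpers are minimal re-implementations, a pthread mutex stands in for the reset domain's lock, and only the shape of the get/put helpers mirrors the diff above.

#include <stdio.h>
#include <stddef.h>
#include <pthread.h>

/* Minimal stand-ins for the kernel's list_head machinery. */
struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void list_add_tail(struct list_head *entry, struct list_head *head)
{
	entry->prev = head->prev;
	entry->next = head;
	head->prev->next = entry;
	head->prev = entry;
}

static int list_empty(const struct list_head *head)
{
	return head->next == head;
}

/* Stand-in for the shared reset domain; hive members would point at one. */
struct reset_domain { pthread_mutex_t sem; };

struct device {
	struct reset_domain *reset_domain;
	struct list_head reset_list;
};

/* Mirrors amdgpu_device_recovery_get_reset_lock(): tolerate an empty
 * list, then lock the domain once via the first entry. */
static void recovery_get_reset_lock(struct list_head *device_list)
{
	struct device *first;

	if (list_empty(device_list))
		return;
	first = container_of(device_list->next, struct device, reset_list);
	pthread_mutex_lock(&first->reset_domain->sem);
}

/* Mirrors amdgpu_device_recovery_put_reset_lock(): the symmetric unlock,
 * safe to call on any exit path because it re-checks the list. */
static void recovery_put_reset_lock(struct list_head *device_list)
{
	struct device *first;

	if (list_empty(device_list))
		return;
	first = container_of(device_list->next, struct device, reset_list);
	pthread_mutex_unlock(&first->reset_domain->sem);
}

int main(void)
{
	struct reset_domain domain = { .sem = PTHREAD_MUTEX_INITIALIZER };
	struct device gpu = { .reset_domain = &domain };
	struct list_head device_list = LIST_HEAD_INIT(device_list);

	list_add_tail(&gpu.reset_list, &device_list);

	recovery_get_reset_lock(&device_list);
	/* ... halt activities, reset the ASIC, resume schedulers ... */
	recovery_put_reset_lock(&device_list);

	puts("reset lock paired correctly");
	return 0;
}

Factoring the unlock into a helper that is called from a single reset_unlock label is what lets the error paths in amdgpu_device_gpu_recover(), and the PCI error handlers, drop the lock without each duplicating the first-entry lookup.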