File tree Expand file tree Collapse file tree 1 file changed +15
-5
lines changed
drivers/gpu/drm/amd/amdgpu Expand file tree Collapse file tree 1 file changed +15
-5
lines changed Original file line number Diff line number Diff line change @@ -1424,12 +1424,22 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
1424
1424
{
1425
1425
struct amdgpu_ras * ras =
1426
1426
container_of (work , struct amdgpu_ras , recovery_work );
1427
+ struct amdgpu_device * remote_adev = NULL ;
1428
+ struct amdgpu_device * adev = ras -> adev ;
1429
+ struct list_head device_list , * device_list_handle = NULL ;
1430
+ struct amdgpu_hive_info * hive = amdgpu_get_xgmi_hive (adev , false);
1431
+
1432
+ /* Build list of devices to query RAS related errors */
1433
+ if (hive && adev -> gmc .xgmi .num_physical_nodes > 1 ) {
1434
+ device_list_handle = & hive -> device_list ;
1435
+ } else {
1436
+ list_add_tail (& adev -> gmc .xgmi .head , & device_list );
1437
+ device_list_handle = & device_list ;
1438
+ }
1427
1439
1428
- /*
1429
- * Query and print non zero error counter per IP block for
1430
- * awareness before recovering GPU.
1431
- */
1432
- amdgpu_ras_log_on_err_counter (ras -> adev );
1440
+ list_for_each_entry (remote_adev , device_list_handle , gmc .xgmi .head ) {
1441
+ amdgpu_ras_log_on_err_counter (remote_adev );
1442
+ }
1433
1443
1434
1444
if (amdgpu_device_should_recover_gpu (ras -> adev ))
1435
1445
amdgpu_device_gpu_recover (ras -> adev , 0 );
You can’t perform that action at this time.
0 commit comments