@@ -152,8 +152,9 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
152
152
153
153
static int amdgpu_reserve_page_direct (struct amdgpu_device * adev , uint64_t address )
154
154
{
155
- struct ras_err_data err_data = { 0 , 0 , 0 , NULL } ;
155
+ struct ras_err_data err_data ;
156
156
struct eeprom_table_record err_rec ;
157
+ int ret ;
157
158
158
159
if ((address >= adev -> gmc .mc_vram_size ) ||
159
160
(address >= RAS_UMC_INJECT_ADDR_LIMIT )) {
@@ -170,6 +171,10 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
170
171
return 0 ;
171
172
}
172
173
174
+ ret = amdgpu_ras_error_data_init (& err_data );
175
+ if (ret )
176
+ return ret ;
177
+
173
178
memset (& err_rec , 0x0 , sizeof (struct eeprom_table_record ));
174
179
err_data .err_addr = & err_rec ;
175
180
amdgpu_umc_fill_error_record (& err_data , address , address , 0 , 0 );
@@ -180,6 +185,8 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
180
185
amdgpu_ras_save_bad_pages (adev , NULL );
181
186
}
182
187
188
+ amdgpu_ras_error_data_fini (& err_data );
189
+
183
190
dev_warn (adev -> dev , "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n" );
184
191
dev_warn (adev -> dev , "Clear EEPROM:\n" );
185
192
dev_warn (adev -> dev , " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n" );
@@ -1015,25 +1022,127 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
1015
1022
}
1016
1023
}
1017
1024
1025
+ static void amdgpu_ras_error_print_error_data (struct amdgpu_device * adev ,
1026
+ struct ras_query_if * query_if ,
1027
+ struct ras_err_data * err_data ,
1028
+ bool is_ue )
1029
+ {
1030
+ struct ras_manager * ras_mgr = amdgpu_ras_find_obj (adev , & query_if -> head );
1031
+ const char * blk_name = get_ras_block_str (& query_if -> head );
1032
+ struct amdgpu_smuio_mcm_config_info * mcm_info ;
1033
+ struct ras_err_node * err_node ;
1034
+ struct ras_err_info * err_info ;
1035
+
1036
+ if (is_ue )
1037
+ dev_info (adev -> dev , "%ld uncorrectable hardware errors detected in %s block\n" ,
1038
+ ras_mgr -> err_data .ue_count , blk_name );
1039
+ else
1040
+ dev_info (adev -> dev , "%ld correctable hardware errors detected in %s block\n" ,
1041
+ ras_mgr -> err_data .ue_count , blk_name );
1042
+
1043
+ for_each_ras_error (err_node , err_data ) {
1044
+ err_info = & err_node -> err_info ;
1045
+ mcm_info = & err_info -> mcm_info ;
1046
+ if (is_ue && err_info -> ue_count ) {
1047
+ dev_info (adev -> dev , "socket: %d, die: %d "
1048
+ "%lld uncorrectable hardware errors detected in %s block\n" ,
1049
+ mcm_info -> socket_id ,
1050
+ mcm_info -> die_id ,
1051
+ err_info -> ue_count ,
1052
+ blk_name );
1053
+ } else if (!is_ue && err_info -> ce_count ) {
1054
+ dev_info (adev -> dev , "socket: %d, die: %d "
1055
+ "%lld correctable hardware errors detected in %s block\n" ,
1056
+ mcm_info -> socket_id ,
1057
+ mcm_info -> die_id ,
1058
+ err_info -> ue_count ,
1059
+ blk_name );
1060
+ }
1061
+ }
1062
+ }
1063
+
1064
+ static void amdgpu_ras_error_generate_report (struct amdgpu_device * adev ,
1065
+ struct ras_query_if * query_if ,
1066
+ struct ras_err_data * err_data )
1067
+ {
1068
+ struct ras_manager * ras_mgr = amdgpu_ras_find_obj (adev , & query_if -> head );
1069
+ const char * blk_name = get_ras_block_str (& query_if -> head );
1070
+
1071
+ if (err_data -> ce_count ) {
1072
+ if (!list_empty (& err_data -> err_node_list )) {
1073
+ amdgpu_ras_error_print_error_data (adev , query_if ,
1074
+ err_data , false);
1075
+ } else if (!adev -> aid_mask &&
1076
+ adev -> smuio .funcs &&
1077
+ adev -> smuio .funcs -> get_socket_id &&
1078
+ adev -> smuio .funcs -> get_die_id ) {
1079
+ dev_info (adev -> dev , "socket: %d, die: %d "
1080
+ "%ld correctable hardware errors "
1081
+ "detected in %s block, no user "
1082
+ "action is needed.\n" ,
1083
+ adev -> smuio .funcs -> get_socket_id (adev ),
1084
+ adev -> smuio .funcs -> get_die_id (adev ),
1085
+ ras_mgr -> err_data .ce_count ,
1086
+ blk_name );
1087
+ } else {
1088
+ dev_info (adev -> dev , "%ld correctable hardware errors "
1089
+ "detected in %s block, no user "
1090
+ "action is needed.\n" ,
1091
+ ras_mgr -> err_data .ce_count ,
1092
+ blk_name );
1093
+ }
1094
+ }
1095
+
1096
+ if (err_data -> ue_count ) {
1097
+ if (!list_empty (& err_data -> err_node_list )) {
1098
+ amdgpu_ras_error_print_error_data (adev , query_if ,
1099
+ err_data , true);
1100
+ } else if (!adev -> aid_mask &&
1101
+ adev -> smuio .funcs &&
1102
+ adev -> smuio .funcs -> get_socket_id &&
1103
+ adev -> smuio .funcs -> get_die_id ) {
1104
+ dev_info (adev -> dev , "socket: %d, die: %d "
1105
+ "%ld uncorrectable hardware errors "
1106
+ "detected in %s block\n" ,
1107
+ adev -> smuio .funcs -> get_socket_id (adev ),
1108
+ adev -> smuio .funcs -> get_die_id (adev ),
1109
+ ras_mgr -> err_data .ue_count ,
1110
+ blk_name );
1111
+ } else {
1112
+ dev_info (adev -> dev , "%ld uncorrectable hardware errors "
1113
+ "detected in %s block\n" ,
1114
+ ras_mgr -> err_data .ue_count ,
1115
+ blk_name );
1116
+ }
1117
+ }
1118
+
1119
+ }
1120
+
1018
1121
/* query/inject/cure begin */
1019
1122
int amdgpu_ras_query_error_status (struct amdgpu_device * adev ,
1020
1123
struct ras_query_if * info )
1021
1124
{
1022
1125
struct amdgpu_ras_block_object * block_obj = NULL ;
1023
1126
struct ras_manager * obj = amdgpu_ras_find_obj (adev , & info -> head );
1024
- struct ras_err_data err_data = {0 , 0 , 0 , NULL };
1127
+ struct ras_err_data err_data ;
1128
+ int ret ;
1025
1129
1026
1130
if (!obj )
1027
1131
return - EINVAL ;
1028
1132
1133
+ ret = amdgpu_ras_error_data_init (& err_data );
1134
+ if (ret )
1135
+ return ret ;
1136
+
1029
1137
if (info -> head .block == AMDGPU_RAS_BLOCK__UMC ) {
1030
1138
amdgpu_ras_get_ecc_info (adev , & err_data );
1031
1139
} else {
1032
1140
block_obj = amdgpu_ras_get_ras_block (adev , info -> head .block , 0 );
1033
1141
if (!block_obj || !block_obj -> hw_ops ) {
1034
1142
dev_dbg_once (adev -> dev , "%s doesn't config RAS function\n" ,
1035
1143
get_ras_block_str (& info -> head ));
1036
- return - EINVAL ;
1144
+ ret = - EINVAL ;
1145
+ goto out_fini_err_data ;
1037
1146
}
1038
1147
1039
1148
if (block_obj -> hw_ops -> query_ras_error_count )
@@ -1053,48 +1162,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
1053
1162
info -> ue_count = obj -> err_data .ue_count ;
1054
1163
info -> ce_count = obj -> err_data .ce_count ;
1055
1164
1056
- if (err_data .ce_count ) {
1057
- if (!adev -> aid_mask &&
1058
- adev -> smuio .funcs &&
1059
- adev -> smuio .funcs -> get_socket_id &&
1060
- adev -> smuio .funcs -> get_die_id ) {
1061
- dev_info (adev -> dev , "socket: %d, die: %d "
1062
- "%ld correctable hardware errors "
1063
- "detected in %s block, no user "
1064
- "action is needed.\n" ,
1065
- adev -> smuio .funcs -> get_socket_id (adev ),
1066
- adev -> smuio .funcs -> get_die_id (adev ),
1067
- obj -> err_data .ce_count ,
1068
- get_ras_block_str (& info -> head ));
1069
- } else {
1070
- dev_info (adev -> dev , "%ld correctable hardware errors "
1071
- "detected in %s block, no user "
1072
- "action is needed.\n" ,
1073
- obj -> err_data .ce_count ,
1074
- get_ras_block_str (& info -> head ));
1075
- }
1076
- }
1077
- if (err_data .ue_count ) {
1078
- if (!adev -> aid_mask &&
1079
- adev -> smuio .funcs &&
1080
- adev -> smuio .funcs -> get_socket_id &&
1081
- adev -> smuio .funcs -> get_die_id ) {
1082
- dev_info (adev -> dev , "socket: %d, die: %d "
1083
- "%ld uncorrectable hardware errors "
1084
- "detected in %s block\n" ,
1085
- adev -> smuio .funcs -> get_socket_id (adev ),
1086
- adev -> smuio .funcs -> get_die_id (adev ),
1087
- obj -> err_data .ue_count ,
1088
- get_ras_block_str (& info -> head ));
1089
- } else {
1090
- dev_info (adev -> dev , "%ld uncorrectable hardware errors "
1091
- "detected in %s block\n" ,
1092
- obj -> err_data .ue_count ,
1093
- get_ras_block_str (& info -> head ));
1094
- }
1095
- }
1165
+ amdgpu_ras_error_generate_report (adev , info , & err_data );
1096
1166
1097
- return 0 ;
1167
+ out_fini_err_data :
1168
+ amdgpu_ras_error_data_fini (& err_data );
1169
+
1170
+ return ret ;
1098
1171
}
1099
1172
1100
1173
int amdgpu_ras_reset_error_status (struct amdgpu_device * adev ,
@@ -1744,12 +1817,16 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
1744
1817
struct amdgpu_iv_entry * entry )
1745
1818
{
1746
1819
struct ras_ih_data * data = & obj -> ih_data ;
1747
- struct ras_err_data err_data = { 0 , 0 , 0 , NULL } ;
1820
+ struct ras_err_data err_data ;
1748
1821
int ret ;
1749
1822
1750
1823
if (!data -> cb )
1751
1824
return ;
1752
1825
1826
+ ret = amdgpu_ras_error_data_init (& err_data );
1827
+ if (ret )
1828
+ return ;
1829
+
1753
1830
/* Let IP handle its data, maybe we need get the output
1754
1831
* from the callback to update the error type/count, etc
1755
1832
*/
@@ -1766,6 +1843,8 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
1766
1843
obj -> err_data .ue_count += err_data .ue_count ;
1767
1844
obj -> err_data .ce_count += err_data .ce_count ;
1768
1845
}
1846
+
1847
+ amdgpu_ras_error_data_fini (& err_data );
1769
1848
}
1770
1849
1771
1850
static void amdgpu_ras_interrupt_handler (struct ras_manager * obj )
@@ -3383,3 +3462,128 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
3383
3462
WREG32 (err_status_hi_offset , 0 );
3384
3463
}
3385
3464
}
3465
+
3466
+ int amdgpu_ras_error_data_init (struct ras_err_data * err_data )
3467
+ {
3468
+ memset (err_data , 0 , sizeof (* err_data ));
3469
+
3470
+ INIT_LIST_HEAD (& err_data -> err_node_list );
3471
+
3472
+ return 0 ;
3473
+ }
3474
+
3475
+ static void amdgpu_ras_error_node_release (struct ras_err_node * err_node )
3476
+ {
3477
+ if (!err_node )
3478
+ return ;
3479
+
3480
+ list_del (& err_node -> node );
3481
+ kvfree (err_node );
3482
+ }
3483
+
3484
+ void amdgpu_ras_error_data_fini (struct ras_err_data * err_data )
3485
+ {
3486
+ struct ras_err_node * err_node , * tmp ;
3487
+
3488
+ list_for_each_entry_safe (err_node , tmp , & err_data -> err_node_list , node ) {
3489
+ amdgpu_ras_error_node_release (err_node );
3490
+ list_del (& err_node -> node );
3491
+ }
3492
+ }
3493
+
3494
+ static struct ras_err_node * amdgpu_ras_error_find_node_by_id (struct ras_err_data * err_data ,
3495
+ struct amdgpu_smuio_mcm_config_info * mcm_info )
3496
+ {
3497
+ struct ras_err_node * err_node ;
3498
+ struct amdgpu_smuio_mcm_config_info * ref_id ;
3499
+
3500
+ if (!err_data || !mcm_info )
3501
+ return NULL ;
3502
+
3503
+ for_each_ras_error (err_node , err_data ) {
3504
+ ref_id = & err_node -> err_info .mcm_info ;
3505
+ if ((mcm_info -> socket_id >= 0 && mcm_info -> socket_id != ref_id -> socket_id ) ||
3506
+ (mcm_info -> die_id >= 0 && mcm_info -> die_id != ref_id -> die_id ))
3507
+ continue ;
3508
+
3509
+ return err_node ;
3510
+ }
3511
+
3512
+ return NULL ;
3513
+ }
3514
+
3515
+ static struct ras_err_node * amdgpu_ras_error_node_new (void )
3516
+ {
3517
+ struct ras_err_node * err_node ;
3518
+
3519
+ err_node = kvzalloc (sizeof (* err_node ), GFP_KERNEL );
3520
+ if (!err_node )
3521
+ return NULL ;
3522
+
3523
+ INIT_LIST_HEAD (& err_node -> node );
3524
+
3525
+ return err_node ;
3526
+ }
3527
+
3528
+ static struct ras_err_info * amdgpu_ras_error_get_info (struct ras_err_data * err_data ,
3529
+ struct amdgpu_smuio_mcm_config_info * mcm_info )
3530
+ {
3531
+ struct ras_err_node * err_node ;
3532
+
3533
+ err_node = amdgpu_ras_error_find_node_by_id (err_data , mcm_info );
3534
+ if (err_node )
3535
+ return & err_node -> err_info ;
3536
+
3537
+ err_node = amdgpu_ras_error_node_new ();
3538
+ if (!err_node )
3539
+ return NULL ;
3540
+
3541
+ memcpy (& err_node -> err_info .mcm_info , mcm_info , sizeof (* mcm_info ));
3542
+
3543
+ err_data -> err_list_count ++ ;
3544
+ list_add_tail (& err_node -> node , & err_data -> err_node_list );
3545
+
3546
+ return & err_node -> err_info ;
3547
+ }
3548
+
3549
+ int amdgpu_ras_error_statistic_ue_count (struct ras_err_data * err_data ,
3550
+ struct amdgpu_smuio_mcm_config_info * mcm_info , u64 count )
3551
+ {
3552
+ struct ras_err_info * err_info ;
3553
+
3554
+ if (!err_data || !mcm_info )
3555
+ return - EINVAL ;
3556
+
3557
+ if (!count )
3558
+ return 0 ;
3559
+
3560
+ err_info = amdgpu_ras_error_get_info (err_data , mcm_info );
3561
+ if (!err_info )
3562
+ return - EINVAL ;
3563
+
3564
+ err_info -> ue_count += count ;
3565
+ err_data -> ue_count += count ;
3566
+
3567
+ return 0 ;
3568
+ }
3569
+
3570
+ int amdgpu_ras_error_statistic_ce_count (struct ras_err_data * err_data ,
3571
+ struct amdgpu_smuio_mcm_config_info * mcm_info , u64 count )
3572
+ {
3573
+ struct ras_err_info * err_info ;
3574
+
3575
+ if (!err_data || !mcm_info )
3576
+ return - EINVAL ;
3577
+
3578
+ if (!count )
3579
+ return 0 ;
3580
+
3581
+ err_info = amdgpu_ras_error_get_info (err_data , mcm_info );
3582
+ if (!err_info )
3583
+ return - EINVAL ;
3584
+
3585
+ err_info -> ce_count += count ;
3586
+ err_data -> ce_count += count ;
3587
+
3588
+ return 0 ;
3589
+ }
0 commit comments