Skip to content

Commit 2dd6a04

Browse files
Stanley.Yang
authored and committed
drm/amdgpu: message smu to update bad channel info
The driver should notify the SMU to update its bad channel info when an uncorrectable error is detected in the UMC block. Change-Id: I2dc8848affdb53e52891013953ae9383fff5f20f Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
1 parent 9338d13 commit 2dd6a04

File tree

5 files changed

+42
-2
lines changed

5 files changed

+42
-2
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2066,6 +2066,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
20662066
mutex_init(&con->recovery_lock);
20672067
INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
20682068
atomic_set(&con->in_recovery, 0);
2069+
con->eeprom_control.bad_channel_bitmap = 0;
20692070

20702071
max_eeprom_records_count = amdgpu_ras_eeprom_max_record_count();
20712072
amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
@@ -2090,6 +2091,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
20902091
goto free;
20912092

20922093
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
2094+
2095+
if (con->update_channel_flag == true) {
2096+
amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
2097+
con->update_channel_flag = false;
2098+
}
20932099
}
20942100

20952101
#ifdef HAVE_SMCA_UMC_V2
@@ -2284,6 +2290,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
22842290
goto release_con;
22852291
}
22862292

2293+
con->update_channel_flag = false;
22872294
con->features = 0;
22882295
INIT_LIST_HEAD(&con->head);
22892296
/* Might need get this flag from vbios. */

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,9 @@ struct amdgpu_ras {
375375

376376
/* record umc error info queried from smu */
377377
struct umc_ecc_info umc_ecc;
378+
379+
/* Indicates whether updated bad channel info needs to be sent to the SMU */
380+
bool update_channel_flag;
378381
};
379382

380383
struct ras_fs_data {

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
265265
{
266266
struct amdgpu_device *adev = to_amdgpu_device(control);
267267
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
268+
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
268269
u8 csum;
269270
int res;
270271

@@ -285,6 +286,10 @@ int amdgpu_ras_eeprom_reset_table(struct amdgpu_ras_eeprom_control *control)
285286

286287
amdgpu_dpm_send_hbm_bad_pages_num(adev, control->ras_num_recs);
287288

289+
control->bad_channel_bitmap = 0;
290+
amdgpu_dpm_send_hbm_bad_channel_flag(adev, control->bad_channel_bitmap);
291+
con->update_channel_flag = false;
292+
288293
amdgpu_ras_debugfs_set_ret_size(control);
289294

290295
mutex_unlock(&control->ras_tbl_mutex);
@@ -418,6 +423,7 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
418423
struct eeprom_table_record *record,
419424
const u32 num)
420425
{
426+
struct amdgpu_ras *con = amdgpu_ras_get_context(to_amdgpu_device(control));
421427
u32 a, b, i;
422428
u8 *buf, *pp;
423429
int res;
@@ -429,9 +435,16 @@ amdgpu_ras_eeprom_append_table(struct amdgpu_ras_eeprom_control *control,
429435
/* Encode all of them in one go.
430436
*/
431437
pp = buf;
432-
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
438+
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
433439
__encode_table_record_to_buf(control, &record[i], pp);
434440

441+
/* update bad channel bitmap */
442+
if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
443+
control->bad_channel_bitmap |= 1 << record[i].mem_channel;
444+
con->update_channel_flag = true;
445+
}
446+
}
447+
435448
/* a, first record index to write into.
436449
* b, last record index to write into.
437450
* a = first index to read (fri) + number of records in the table,
@@ -684,6 +697,7 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
684697
const u32 num)
685698
{
686699
struct amdgpu_device *adev = to_amdgpu_device(control);
700+
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
687701
int i, res;
688702
u8 *buf, *pp;
689703
u32 g0, g1;
@@ -751,8 +765,15 @@ int amdgpu_ras_eeprom_read(struct amdgpu_ras_eeprom_control *control,
751765
/* Read up everything? Then transform.
752766
*/
753767
pp = buf;
754-
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
768+
for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
755769
__decode_table_record_from_buf(control, &record[i], pp);
770+
771+
/* update bad channel bitmap */
772+
if (!(control->bad_channel_bitmap & (1 << record[i].mem_channel))) {
773+
control->bad_channel_bitmap |= 1 << record[i].mem_channel;
774+
con->update_channel_flag = true;
775+
}
776+
}
756777
Out:
757778
kfree(buf);
758779
mutex_unlock(&control->ras_tbl_mutex);

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control {
8080
/* Protect table access via this mutex.
8181
*/
8282
struct mutex ras_tbl_mutex;
83+
84+
/* Bitmap of the memory channels on which bad pages have occurred
85+
*/
86+
u32 bad_channel_bitmap;
8387
};
8488

8589
/*

drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
9797
amdgpu_ras_save_bad_pages(adev);
9898

9999
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
100+
101+
if (con->update_channel_flag == true) {
102+
amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
103+
con->update_channel_flag = false;
104+
}
100105
}
101106

102107
if (reset)

0 commit comments

Comments
 (0)