Skip to content

Commit b2aa6b1

Browse files
YiPeng Chai authored and alexdeucher committed
drm/amdgpu: umc v12_0 converts error address
Umc v12_0 converts error address. Signed-off-by: YiPeng Chai <[email protected]> Reviewed-by: Tao Zhou <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 95b4063 commit b2aa6b1

File tree

2 files changed

+105
-1
lines changed

2 files changed

+105
-1
lines changed

drivers/gpu/drm/amd/amdgpu/umc_v12_0.c

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,66 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
222222
}
223223
}
224224

225+
static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev,
226+
struct ta_ras_query_address_input *addr_in,
227+
uint64_t *pfns, int len)
228+
{
229+
uint32_t col, row, row_xor, bank, channel_index;
230+
uint64_t soc_pa, retired_page, column, err_addr;
231+
struct ta_ras_query_address_output addr_out;
232+
uint32_t pos = 0;
233+
234+
err_addr = addr_in->ma.err_addr;
235+
addr_in->addr_type = TA_RAS_MCA_TO_PA;
236+
if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) {
237+
dev_warn(adev->dev, "Failed to query RAS physical address for 0x%llx",
238+
err_addr);
239+
return 0;
240+
}
241+
242+
soc_pa = addr_out.pa.pa;
243+
bank = addr_out.pa.bank;
244+
channel_index = addr_out.pa.channel_idx;
245+
246+
col = (err_addr >> 1) & 0x1fULL;
247+
row = (err_addr >> 10) & 0x3fffULL;
248+
row_xor = row ^ (0x1ULL << 13);
249+
/* clear [C3 C2] in soc physical address */
250+
soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
251+
/* clear [C4] in soc physical address */
252+
soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
253+
254+
/* loop for all possibilities of [C4 C3 C2] */
255+
for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
256+
retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
257+
retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
258+
259+
if (pos >= len)
260+
return 0;
261+
pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
262+
263+
/* include column bit 0 and 1 */
264+
col &= 0x3;
265+
col |= (column << 2);
266+
dev_info(adev->dev,
267+
"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
268+
retired_page, row, col, bank, channel_index);
269+
270+
/* shift R13 bit */
271+
retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
272+
273+
if (pos >= len)
274+
return 0;
275+
pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
276+
277+
dev_info(adev->dev,
278+
"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
279+
retired_page, row_xor, col, bank, channel_index);
280+
}
281+
282+
return pos;
283+
}
284+
225285
static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
226286
uint32_t node_inst, uint32_t umc_inst,
227287
uint32_t ch_inst, void *data)
@@ -482,8 +542,12 @@ static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, struct ras_common
482542
static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
483543
uint64_t status, uint64_t ipid, uint64_t addr)
484544
{
485-
uint16_t hwid, mcatype;
486545
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
546+
uint16_t hwid, mcatype;
547+
struct ta_ras_query_address_input addr_in;
548+
uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
549+
uint64_t err_addr;
550+
int count;
487551

488552
hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
489553
mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
@@ -497,6 +561,34 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
497561
if (!umc_v12_0_is_deferred_error(adev, status))
498562
return 0;
499563

564+
err_addr = REG_GET_FIELD(addr,
565+
MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr);
566+
567+
dev_info(adev->dev,
568+
"UMC:IPID:0x%llx, socket:%llu, aid:%llu, inst:%llu, ch:%llu, err_addr:0x%llx\n",
569+
ipid,
570+
MCA_IPID_2_SOCKET_ID(ipid),
571+
MCA_IPID_2_DIE_ID(ipid),
572+
MCA_IPID_2_UMC_INST(ipid),
573+
MCA_IPID_2_UMC_CH(ipid),
574+
err_addr);
575+
576+
memset(page_pfn, 0, sizeof(page_pfn));
577+
578+
memset(&addr_in, 0, sizeof(addr_in));
579+
addr_in.ma.err_addr = err_addr;
580+
addr_in.ma.ch_inst = MCA_IPID_2_UMC_CH(ipid);
581+
addr_in.ma.umc_inst = MCA_IPID_2_UMC_INST(ipid);
582+
addr_in.ma.node_inst = MCA_IPID_2_DIE_ID(ipid);
583+
addr_in.ma.socket_id = MCA_IPID_2_SOCKET_ID(ipid);
584+
585+
count = umc_v12_0_convert_err_addr(adev,
586+
&addr_in, page_pfn, ARRAY_SIZE(page_pfn));
587+
if (count <= 0) {
588+
dev_warn(adev->dev, "Fail to convert error address! count:%d\n", count);
589+
return 0;
590+
}
591+
500592
con->umc_ecc_log.de_updated = true;
501593

502594
return 0;

drivers/gpu/drm/amd/amdgpu/umc_v12_0.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,18 @@
6969
(((_ipid_lo) >> 12) & 0xF))
7070
#define MCA_IPID_LO_2_UMC_INST(_ipid_lo) (((_ipid_lo) >> 21) & 0x7)
7171

72+
/*
 * Helpers that decode fields of a full MCA IPID value. Each one pulls the
 * InstanceIdHi or InstanceIdLo field out of the value with REG_GET_FIELD()
 * using the MCMP1_IPIDT0 register layout.
 */

/* Die id: bits [3:2] of the InstanceIdHi field. */
#define MCA_IPID_2_DIE_ID(ipid) ((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) >> 2) & 0x03)
73+
74+
/* UMC channel: MCA_IPID_LO_2_UMC_CH() applied to the InstanceIdLo field. */
#define MCA_IPID_2_UMC_CH(ipid) \
75+
(MCA_IPID_LO_2_UMC_CH(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo)))
76+
77+
/* UMC instance: MCA_IPID_LO_2_UMC_INST() applied to the InstanceIdLo field. */
#define MCA_IPID_2_UMC_INST(ipid) \
78+
(MCA_IPID_LO_2_UMC_INST(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo)))
79+
80+
/*
 * Socket id: bit 0 of InstanceIdLo becomes bit 2 of the result, and
 * bits [1:0] of InstanceIdHi supply the low two bits.
 */
#define MCA_IPID_2_SOCKET_ID(ipid) \
81+
(((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \
82+
(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03))
83+
7284
bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
7385
bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
7486
bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);

0 commit comments

Comments
 (0)