Skip to content

Commit 3907c49

Browse files
John Clements, alexdeucher
authored and committed
drm/amdgpu: Add driver infrastructure for MCA RAS
Add MCA-specific IP blocks targeting RAS features. Reviewed-by: Hawking Zhang <[email protected]> Signed-off-by: John Clements <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 3341d30 commit 3907c49

File tree

9 files changed

+388
-2
lines changed

9 files changed

+388
-2
lines changed

drivers/gpu/drm/amd/amdgpu/Makefile

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
5858
amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
5959
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
6060
amdgpu_fw_attestation.o amdgpu_securedisplay.o amdgpu_hdp.o \
61-
amdgpu_eeprom.o
61+
amdgpu_eeprom.o amdgpu_mca.o
6262

6363
amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
6464

@@ -189,6 +189,10 @@ amdgpu-y += \
189189
amdgpu-y += \
190190
amdgpu_reset.o
191191

192+
# add MCA block
193+
amdgpu-y += \
194+
mca_v3_0.o
195+
192196
# add amdkfd interfaces
193197
amdgpu-y += amdgpu_amdkfd.o
194198

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,7 @@
108108
#include "amdgpu_df.h"
109109
#include "amdgpu_smuio.h"
110110
#include "amdgpu_fdinfo.h"
111+
#include "amdgpu_mca.h"
111112

112113
#define MAX_GPU_INSTANCE 16
113114

@@ -1009,6 +1010,9 @@ struct amdgpu_device {
10091010
/* df */
10101011
struct amdgpu_df df;
10111012

1013+
/* MCA */
1014+
struct amdgpu_mca mca;
1015+
10121016
struct amdgpu_ip_block ip_blocks[AMDGPU_MAX_IP_NUM];
10131017
uint32_t harvest_ip_mask;
10141018
int num_ip_blocks;

drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,27 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
471471
return r;
472472
}
473473

474+
if (adev->mca.mp0.ras_funcs &&
475+
adev->mca.mp0.ras_funcs->ras_late_init) {
476+
r = adev->mca.mp0.ras_funcs->ras_late_init(adev);
477+
if (r)
478+
return r;
479+
}
480+
481+
if (adev->mca.mp1.ras_funcs &&
482+
adev->mca.mp1.ras_funcs->ras_late_init) {
483+
r = adev->mca.mp1.ras_funcs->ras_late_init(adev);
484+
if (r)
485+
return r;
486+
}
487+
488+
if (adev->mca.mpio.ras_funcs &&
489+
adev->mca.mpio.ras_funcs->ras_late_init) {
490+
r = adev->mca.mpio.ras_funcs->ras_late_init(adev);
491+
if (r)
492+
return r;
493+
}
494+
474495
return 0;
475496
}
476497

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
/*
2+
* Copyright 2021 Advanced Micro Devices, Inc.
3+
*
4+
* Permission is hereby granted, free of charge, to any person obtaining a
5+
* copy of this software and associated documentation files (the "Software"),
6+
* to deal in the Software without restriction, including without limitation
7+
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8+
* and/or sell copies of the Software, and to permit persons to whom the
9+
* Software is furnished to do so, subject to the following conditions:
10+
*
11+
* The above copyright notice and this permission notice shall be included in
12+
* all copies or substantial portions of the Software.
13+
*
14+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15+
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17+
* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18+
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19+
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20+
* OTHER DEALINGS IN THE SOFTWARE.
21+
*
22+
*/
23+
#include "amdgpu_ras.h"
24+
#include "amdgpu.h"
25+
#include "amdgpu_mca.h"
26+
27+
#include "umc/umc_6_7_0_offset.h"
28+
#include "umc/umc_6_7_0_sh_mask.h"
29+
30+
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
31+
uint64_t mc_status_addr,
32+
unsigned long *error_count)
33+
{
34+
uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4);
35+
36+
if (REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 &&
37+
REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 1)
38+
*error_count += 1;
39+
}
40+
41+
void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
42+
uint64_t mc_status_addr,
43+
unsigned long *error_count)
44+
{
45+
uint64_t mc_status = RREG64_PCIE(mc_status_addr * 4);
46+
47+
if ((REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1) &&
48+
(REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Deferred) == 1 ||
49+
REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 1 ||
50+
REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 ||
51+
REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 ||
52+
REG_GET_FIELD(mc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 1))
53+
*error_count += 1;
54+
}
55+
56+
/**
 * amdgpu_mca_reset_error_count - clear an MCA status register
 * @adev: amdgpu device; not referenced directly here, presumably consumed by
 *        the WREG64_PCIE() accessor macro
 * @mc_status_addr: MCA status register address; it is multiplied by 4 before
 *                  the write (presumably dword index -> byte offset, confirm
 *                  against callers)
 *
 * Writes a 64-bit zero to the status register.
 */
void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
				  uint64_t mc_status_addr)
{
	const uint64_t status_offset = mc_status_addr * 4;

	WREG64_PCIE(status_offset, 0x0ULL);
}
61+
62+
void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
63+
uint64_t mc_status_addr,
64+
void *ras_error_status)
65+
{
66+
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
67+
68+
amdgpu_mca_query_correctable_error_count(adev, mc_status_addr, &(err_data->ce_count));
69+
amdgpu_mca_query_uncorrectable_error_count(adev, mc_status_addr, &(err_data->ue_count));
70+
71+
amdgpu_mca_reset_error_count(adev, mc_status_addr);
72+
}
73+
74+
int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
75+
struct amdgpu_mca_ras *mca_dev)
76+
{
77+
int r;
78+
struct ras_ih_if ih_info = {
79+
.cb = NULL,
80+
};
81+
struct ras_fs_if fs_info = {
82+
.sysfs_name = mca_dev->ras_funcs->sysfs_name,
83+
};
84+
85+
if (!mca_dev->ras_if) {
86+
mca_dev->ras_if = kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
87+
if (!mca_dev->ras_if)
88+
return -ENOMEM;
89+
mca_dev->ras_if->block = mca_dev->ras_funcs->ras_block;
90+
mca_dev->ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
91+
mca_dev->ras_if->sub_block_index = 0;
92+
}
93+
ih_info.head = fs_info.head = *mca_dev->ras_if;
94+
r = amdgpu_ras_late_init(adev, mca_dev->ras_if,
95+
&fs_info, &ih_info);
96+
if (r || !amdgpu_ras_is_supported(adev, mca_dev->ras_if->block)) {
97+
kfree(mca_dev->ras_if);
98+
mca_dev->ras_if = NULL;
99+
}
100+
101+
return r;
102+
}
103+
104+
void amdgpu_mca_ras_fini(struct amdgpu_device *adev,
105+
struct amdgpu_mca_ras *mca_dev)
106+
{
107+
struct ras_ih_if ih_info = {
108+
.cb = NULL,
109+
};
110+
111+
if (!mca_dev->ras_if)
112+
return;
113+
114+
amdgpu_ras_late_fini(adev, mca_dev->ras_if, &ih_info);
115+
kfree(mca_dev->ras_if);
116+
mca_dev->ras_if = NULL;
117+
}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
/*
2+
* Copyright (C) 2021 Advanced Micro Devices, Inc.
3+
*
4+
* Permission is hereby granted, free of charge, to any person obtaining a
5+
* copy of this software and associated documentation files (the "Software"),
6+
* to deal in the Software without restriction, including without limitation
7+
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8+
* and/or sell copies of the Software, and to permit persons to whom the
9+
* Software is furnished to do so, subject to the following conditions:
10+
*
11+
* The above copyright notice and this permission notice shall be included
12+
* in all copies or substantial portions of the Software.
13+
*
14+
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15+
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16+
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
17+
* THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
18+
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19+
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20+
*/
21+
#ifndef __AMDGPU_MCA_H__
22+
#define __AMDGPU_MCA_H__
23+
24+
/* RAS callbacks and identification data for one MCA sub-block. */
struct amdgpu_mca_ras_funcs {
	/* Late-init hook; a non-zero return is propagated as an error. */
	int (*ras_late_init)(struct amdgpu_device *adev);
	/* Teardown hook. */
	void (*ras_fini)(struct amdgpu_device *adev);
	/* Error-count query; ras_error_status is an opaque pointer. */
	void (*query_ras_error_count)(struct amdgpu_device *adev,
				      void *ras_error_status);
	/* Error-address query; ras_error_status is an opaque pointer. */
	void (*query_ras_error_address)(struct amdgpu_device *adev,
					void *ras_error_status);
	/* Block id copied into ras_common_if.block at late init. */
	uint32_t ras_block;
	/* Name handed to ras_fs_if.sysfs_name at late init. */
	const char *sysfs_name;
};
34+
35+
/* Per-sub-block RAS state. */
struct amdgpu_mca_ras {
	/* Allocated by amdgpu_mca_ras_late_init(), freed by amdgpu_mca_ras_fini(). */
	struct ras_common_if *ras_if;
	/* Callback table for this sub-block; NULL when none is installed. */
	const struct amdgpu_mca_ras_funcs *ras_funcs;
};
39+
40+
/* Top-level MCA callbacks selected per ASIC. */
struct amdgpu_mca_funcs {
	/* Invoked from gmc_v9_0_sw_init() when a table is installed. */
	void (*init)(struct amdgpu_device *adev);
};
43+
44+
struct amdgpu_mca {
45+
const struct amdgpu_mca_funcs *funcs;
46+
struct amdgpu_mca_ras mp0;
47+
struct amdgpu_mca_ras mp1;
48+
struct amdgpu_mca_ras mpio;
49+
};
50+
51+
/* Add one to *error_count if the status register logs a correctable error. */
void amdgpu_mca_query_correctable_error_count(struct amdgpu_device *adev,
					      uint64_t mc_status_addr,
					      unsigned long *error_count);

/* Add one to *error_count if the status register logs an uncorrectable error. */
void amdgpu_mca_query_uncorrectable_error_count(struct amdgpu_device *adev,
						uint64_t mc_status_addr,
						unsigned long *error_count);

/* Write zero to the status register. */
void amdgpu_mca_reset_error_count(struct amdgpu_device *adev,
				  uint64_t mc_status_addr);

/* Update ce_count/ue_count in the struct ras_err_data, then clear the register. */
void amdgpu_mca_query_ras_error_count(struct amdgpu_device *adev,
				      uint64_t mc_status_addr,
				      void *ras_error_status);

/* Register an MCA sub-block with the RAS core; returns 0 or a negative errno. */
int amdgpu_mca_ras_late_init(struct amdgpu_device *adev,
			     struct amdgpu_mca_ras *mca_dev);

/* Unregister an MCA sub-block and free its ras_if. */
void amdgpu_mca_ras_fini(struct amdgpu_device *adev,
			 struct amdgpu_mca_ras *mca_dev);
71+
72+
#endif

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ enum amdgpu_ras_block {
4949
AMDGPU_RAS_BLOCK__MP0,
5050
AMDGPU_RAS_BLOCK__MP1,
5151
AMDGPU_RAS_BLOCK__FUSE,
52+
AMDGPU_RAS_BLOCK__MPIO,
5253

5354
AMDGPU_RAS_BLOCK__LAST
5455
};
@@ -420,7 +421,7 @@ struct ras_badpage {
420421
/* interfaces for IP */
421422
struct ras_fs_if {
422423
struct ras_common_if head;
423-
char sysfs_name[32];
424+
const char* sysfs_name;
424425
char debugfs_name[32];
425426
};
426427

drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
#include "umc_v6_0.h"
5656
#include "umc_v6_7.h"
5757
#include "hdp_v4_0.h"
58+
#include "mca_v3_0.h"
5859

5960
#include "ivsrcid/vmc/irqsrcs_vmc_1_0.h"
6061

@@ -1229,6 +1230,18 @@ static void gmc_v9_0_set_hdp_ras_funcs(struct amdgpu_device *adev)
12291230
adev->hdp.ras_funcs = &hdp_v4_0_ras_funcs;
12301231
}
12311232

1233+
static void gmc_v9_0_set_mca_funcs(struct amdgpu_device *adev)
1234+
{
1235+
switch (adev->asic_type) {
1236+
case CHIP_ALDEBARAN:
1237+
if (!adev->gmc.xgmi.connected_to_cpu)
1238+
adev->mca.funcs = &mca_v3_0_funcs;
1239+
break;
1240+
default:
1241+
break;
1242+
}
1243+
}
1244+
12321245
static int gmc_v9_0_early_init(void *handle)
12331246
{
12341247
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -1250,6 +1263,7 @@ static int gmc_v9_0_early_init(void *handle)
12501263
gmc_v9_0_set_mmhub_ras_funcs(adev);
12511264
gmc_v9_0_set_gfxhub_funcs(adev);
12521265
gmc_v9_0_set_hdp_ras_funcs(adev);
1266+
gmc_v9_0_set_mca_funcs(adev);
12531267

12541268
adev->gmc.shared_aperture_start = 0x2000000000000000ULL;
12551269
adev->gmc.shared_aperture_end =
@@ -1461,6 +1475,8 @@ static int gmc_v9_0_sw_init(void *handle)
14611475
adev->gfxhub.funcs->init(adev);
14621476

14631477
adev->mmhub.funcs->init(adev);
1478+
if (adev->mca.funcs)
1479+
adev->mca.funcs->init(adev);
14641480

14651481
spin_lock_init(&adev->gmc.invalidate_lock);
14661482

0 commit comments

Comments
 (0)