Skip to content

Commit fec8c52

Browse files
Tao Zhou authored and alexdeucher committed
drm/amdgpu: save error count in RAS poison handler
Otherwise the RAS error count couldn't be queried from sysfs.

Signed-off-by: Tao Zhou <[email protected]>
Reviewed-by: Stanley.Yang <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent 45e3d1d commit fec8c52

File tree

3 files changed

+97
-76
lines changed

3 files changed

+97
-76
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -727,7 +727,7 @@ void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bo
727727

728728
/* CPU MCA will handle page retirement if connected_to_cpu is 1 */
729729
if (!adev->gmc.xgmi.connected_to_cpu)
730-
amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset);
730+
amdgpu_umc_poison_handler(adev, &err_data, reset);
731731
else if (reset)
732732
amdgpu_amdkfd_gpu_reset(adev);
733733
}

drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

Lines changed: 95 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -23,79 +23,7 @@
2323

2424
#include "amdgpu_ras.h"
2525

26-
/*
 * UMC RAS callback (pre-move copy of the function in this diff).
 *
 * Installed as the .cb member of struct ras_ih_if by the init/fini code in
 * this file. It forwards all arguments to amdgpu_umc_do_page_retirement()
 * and always passes "true" for that function's reset argument.
 *
 * Returns whatever amdgpu_umc_do_page_retirement() returns.
 */
static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
27-
void *ras_error_status,
28-
struct amdgpu_iv_entry *entry)
29-
{
30-
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
31-
}
32-
33-
/*
 * Late RAS initialisation for the UMC block (pre-move copy in this diff).
 *
 * Allocates adev->umc.ras_if if it is not already set, registers it through
 * amdgpu_ras_late_init() together with the "umc_err_count" sysfs name and
 * the amdgpu_umc_process_ras_data_cb callback, takes a reference on
 * adev->gmc.ecc_irq when RAS is supported for the block, and finally calls
 * the optional err_cnt_init hook from adev->umc.ras_funcs.
 *
 * Returns 0 on success, and also 0 when RAS is not supported for the block
 * (in which case ras_if is freed again). Returns -ENOMEM if the allocation
 * fails, or the error code from amdgpu_ras_late_init() / amdgpu_irq_get().
 * On every failure path adev->umc.ras_if is freed and reset to NULL.
 */
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
34-
{
35-
int r;
36-
struct ras_fs_if fs_info = {
37-
.sysfs_name = "umc_err_count",
38-
};
39-
struct ras_ih_if ih_info = {
40-
.cb = amdgpu_umc_process_ras_data_cb,
41-
};
42-
43-
if (!adev->umc.ras_if) {
44-
adev->umc.ras_if =
45-
kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
46-
if (!adev->umc.ras_if)
47-
return -ENOMEM;
48-
/*
 * NOTE(review): only block, type and sub_block_index are assigned here and
 * kmalloc() does not zero memory - confirm no other field is read unset.
 */
adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
49-
adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
50-
adev->umc.ras_if->sub_block_index = 0;
51-
}
52-
/* Both the sysfs and the interrupt descriptor get a copy of the head. */
ih_info.head = fs_info.head = *adev->umc.ras_if;
53-
54-
r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
55-
&fs_info, &ih_info);
56-
if (r)
57-
goto free;
58-
59-
if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
60-
r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
61-
if (r)
62-
goto late_fini;
63-
} else {
64-
/* Not supported: release ras_if but report success to the caller. */
r = 0;
65-
goto free;
66-
}
67-
68-
/* ras init of specific umc version */
69-
if (adev->umc.ras_funcs &&
70-
adev->umc.ras_funcs->err_cnt_init)
71-
adev->umc.ras_funcs->err_cnt_init(adev);
72-
73-
return 0;
74-
75-
/* Error unwinding: late_fini falls through into free. */
late_fini:
76-
amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
77-
free:
78-
kfree(adev->umc.ras_if);
79-
adev->umc.ras_if = NULL;
80-
return r;
81-
}
82-
83-
/*
 * Tear down UMC RAS state (pre-move copy of the function in this diff).
 *
 * Does nothing unless RAS is supported for the UMC block and
 * adev->umc.ras_if is non-NULL. Otherwise it rebuilds the interrupt
 * descriptor (copy of the head plus the amdgpu_umc_process_ras_data_cb
 * callback), hands it to amdgpu_ras_late_fini() and frees ras_if.
 *
 * NOTE(review): adev->umc.ras_if itself is left unchanged after kfree() -
 * confirm nothing dereferences it afterwards.
 */
void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
84-
{
85-
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
86-
adev->umc.ras_if) {
87-
struct ras_common_if *ras_if = adev->umc.ras_if;
88-
struct ras_ih_if ih_info = {
89-
.head = *ras_if,
90-
.cb = amdgpu_umc_process_ras_data_cb,
91-
};
92-
93-
amdgpu_ras_late_fini(adev, ras_if, &ih_info);
94-
kfree(ras_if);
95-
}
96-
}
97-
98-
int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
26+
static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
9927
void *ras_error_status,
10028
struct amdgpu_iv_entry *entry,
10129
bool reset)
@@ -180,6 +108,100 @@ int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
180108
return AMDGPU_RAS_SUCCESS;
181109
}
182110

111+
/*
 * Handle a RAS poison event for the UMC block and record its error counts.
 *
 * Runs amdgpu_umc_do_page_retirement() with a NULL interrupt entry and, if
 * that succeeds, adds the ue/ce counts from the caller's error data to the
 * UMC RAS manager object. Per the commit message, without this accumulation
 * the RAS error count could not be queried from sysfs.
 *
 * @adev:             device the event belongs to
 * @ras_error_status: pointer that is interpreted as struct ras_err_data
 * @reset:            forwarded unchanged to amdgpu_umc_do_page_retirement()
 *                    (presumably requests a GPU reset - confirm in callee)
 *
 * Returns the value returned by amdgpu_umc_do_page_retirement().
 */
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
112+
void *ras_error_status,
113+
bool reset)
114+
{
115+
int ret;
116+
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
117+
/* Only .block is set; the lookup below is keyed on this head. */
struct ras_common_if head = {
118+
.block = AMDGPU_RAS_BLOCK__UMC,
119+
};
120+
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head);
121+
122+
ret =
123+
amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, reset);
124+
125+
/* Counts are saved only on success and when a manager object was found. */
if (ret == AMDGPU_RAS_SUCCESS && obj) {
126+
obj->err_data.ue_count += err_data->ue_count;
127+
obj->err_data.ce_count += err_data->ce_count;
128+
}
129+
130+
return ret;
131+
}
132+
133+
/*
 * UMC RAS callback.
 *
 * Installed as the .cb member of struct ras_ih_if by the init/fini code in
 * this file. It forwards all arguments to amdgpu_umc_do_page_retirement()
 * and always passes "true" for that function's reset argument.
 *
 * Returns whatever amdgpu_umc_do_page_retirement() returns.
 */
static int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
134+
void *ras_error_status,
135+
struct amdgpu_iv_entry *entry)
136+
{
137+
return amdgpu_umc_do_page_retirement(adev, ras_error_status, entry, true);
138+
}
139+
140+
/*
 * Late RAS initialisation for the UMC block.
 *
 * Allocates adev->umc.ras_if if it is not already set, registers it through
 * amdgpu_ras_late_init() together with the "umc_err_count" sysfs name and
 * the amdgpu_umc_process_ras_data_cb callback, takes a reference on
 * adev->gmc.ecc_irq when RAS is supported for the block, and finally calls
 * the optional err_cnt_init hook from adev->umc.ras_funcs.
 *
 * Returns 0 on success, and also 0 when RAS is not supported for the block
 * (in which case ras_if is freed again). Returns -ENOMEM if the allocation
 * fails, or the error code from amdgpu_ras_late_init() / amdgpu_irq_get().
 * On every failure path adev->umc.ras_if is freed and reset to NULL.
 */
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev)
141+
{
142+
int r;
143+
struct ras_fs_if fs_info = {
144+
.sysfs_name = "umc_err_count",
145+
};
146+
struct ras_ih_if ih_info = {
147+
.cb = amdgpu_umc_process_ras_data_cb,
148+
};
149+
150+
if (!adev->umc.ras_if) {
151+
adev->umc.ras_if =
152+
kmalloc(sizeof(struct ras_common_if), GFP_KERNEL);
153+
if (!adev->umc.ras_if)
154+
return -ENOMEM;
155+
/*
 * NOTE(review): only block, type and sub_block_index are assigned here and
 * kmalloc() does not zero memory - confirm no other field is read unset.
 */
adev->umc.ras_if->block = AMDGPU_RAS_BLOCK__UMC;
156+
adev->umc.ras_if->type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
157+
adev->umc.ras_if->sub_block_index = 0;
158+
}
159+
/* Both the sysfs and the interrupt descriptor get a copy of the head. */
ih_info.head = fs_info.head = *adev->umc.ras_if;
160+
161+
r = amdgpu_ras_late_init(adev, adev->umc.ras_if,
162+
&fs_info, &ih_info);
163+
if (r)
164+
goto free;
165+
166+
if (amdgpu_ras_is_supported(adev, adev->umc.ras_if->block)) {
167+
r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
168+
if (r)
169+
goto late_fini;
170+
} else {
171+
/* Not supported: release ras_if but report success to the caller. */
r = 0;
172+
goto free;
173+
}
174+
175+
/* ras init of specific umc version */
176+
if (adev->umc.ras_funcs &&
177+
adev->umc.ras_funcs->err_cnt_init)
178+
adev->umc.ras_funcs->err_cnt_init(adev);
179+
180+
return 0;
181+
182+
/* Error unwinding: late_fini falls through into free. */
late_fini:
183+
amdgpu_ras_late_fini(adev, adev->umc.ras_if, &ih_info);
184+
free:
185+
kfree(adev->umc.ras_if);
186+
adev->umc.ras_if = NULL;
187+
return r;
188+
}
189+
190+
/*
 * Tear down UMC RAS state.
 *
 * Does nothing unless RAS is supported for the UMC block and
 * adev->umc.ras_if is non-NULL. Otherwise it rebuilds the interrupt
 * descriptor (copy of the head plus the amdgpu_umc_process_ras_data_cb
 * callback), hands it to amdgpu_ras_late_fini() and frees ras_if.
 *
 * NOTE(review): adev->umc.ras_if itself is left unchanged after kfree() -
 * confirm nothing dereferences it afterwards.
 */
void amdgpu_umc_ras_fini(struct amdgpu_device *adev)
191+
{
192+
if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC) &&
193+
adev->umc.ras_if) {
194+
struct ras_common_if *ras_if = adev->umc.ras_if;
195+
struct ras_ih_if ih_info = {
196+
.head = *ras_if,
197+
.cb = amdgpu_umc_process_ras_data_cb,
198+
};
199+
200+
amdgpu_ras_late_fini(adev, ras_if, &ih_info);
201+
kfree(ras_if);
202+
}
203+
}
204+
183205
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
184206
struct amdgpu_irq_src *source,
185207
struct amdgpu_iv_entry *entry)

drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,9 +78,8 @@ struct amdgpu_umc {
7878

7979
int amdgpu_umc_ras_late_init(struct amdgpu_device *adev);
8080
void amdgpu_umc_ras_fini(struct amdgpu_device *adev);
81-
int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
81+
int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
8282
void *ras_error_status,
83-
struct amdgpu_iv_entry *entry,
8483
bool reset);
8584
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
8685
struct amdgpu_irq_src *source,

0 commit comments

Comments
 (0)