Skip to content

Commit cbb8f98

Browse files
John Clements and alexdeucher
authored and committed
drm/amdgpu: page retire over debugfs mechanism
Added support in RAS debugfs to add a bad page, for isolated page-retirement testing. Reviewed-by: Hawking Zhang <[email protected]> Signed-off-by: John Clements <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 25315eb commit cbb8f98

File tree

1 file changed

+67
-0
lines changed

1 file changed

+67
-0
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,49 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
9999
return false;
100100
}
101101

102+
static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
103+
{
104+
struct ras_err_data err_data = {0, 0, 0, NULL};
105+
struct eeprom_table_record err_rec;
106+
107+
if ((address >= adev->gmc.mc_vram_size) ||
108+
(address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
109+
dev_warn(adev->dev,
110+
"RAS WARN: input address 0x%llx is invalid.\n",
111+
address);
112+
return -EINVAL;
113+
}
114+
115+
if (amdgpu_ras_check_bad_page(adev, address)) {
116+
dev_warn(adev->dev,
117+
"RAS WARN: 0x%llx has been marked as bad page!\n",
118+
address);
119+
return 0;
120+
}
121+
122+
memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
123+
124+
err_rec.address = address;
125+
err_rec.retired_page = address >> AMDGPU_GPU_PAGE_SHIFT;
126+
err_rec.ts = (uint64_t)ktime_get_real_seconds();
127+
err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE;
128+
129+
err_data.err_addr = &err_rec;
130+
err_data.err_addr_cnt = 1;
131+
132+
if (amdgpu_bad_page_threshold != 0) {
133+
amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
134+
err_data.err_addr_cnt);
135+
amdgpu_ras_save_bad_pages(adev);
136+
}
137+
138+
dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
139+
dev_warn(adev->dev, "Clear EEPROM:\n");
140+
dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
141+
142+
return 0;
143+
}
144+
102145
static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
103146
size_t size, loff_t *pos)
104147
{
@@ -178,11 +221,25 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
178221
op = 1;
179222
else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
180223
op = 2;
224+
else if (sscanf(str, "retire_page") == 0)
225+
op = 3;
181226
else if (str[0] && str[1] && str[2] && str[3])
182227
/* ascii string, but commands are not matched. */
183228
return -EINVAL;
184229

185230
if (op != -1) {
231+
232+
if (op == 3) {
233+
if (sscanf(str, "%*s %llu", &address) != 1)
234+
if (sscanf(str, "%*s 0x%llx", &address) != 1)
235+
return -EINVAL;
236+
237+
data->op = op;
238+
data->inject.address = address;
239+
240+
return 0;
241+
}
242+
186243
if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
187244
return -EINVAL;
188245

@@ -310,6 +367,16 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
310367
if (ret)
311368
return -EINVAL;
312369

370+
if (data.op == 3)
371+
{
372+
ret = amdgpu_reserve_page_direct(adev, data.inject.address);
373+
374+
if (ret)
375+
return size;
376+
else
377+
return ret;
378+
}
379+
313380
if (!amdgpu_ras_is_supported(adev, data.head.block))
314381
return -EINVAL;
315382

0 commit comments

Comments
 (0)