Skip to content

Commit e5c04ed

Browse files
ChristianKoenigAMD authored and alexdeucher committed
drm/amdgpu: revert "reserve backup pages for bad page retirment"
As noted during the review, this approach doesn't make sense at all. Inside the kernel we should not apply any limitation on the VRAM that applications can use. If an application or end user wants to reserve a certain amount of VRAM for bad page handling, that should be done in the upper layer.

This reverts commit f89b881.

Signed-off-by: Christian König <[email protected]>
Reviewed-by: Hawking Zhang <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent 6b44b66 commit e5c04ed

File tree

4 files changed

+15
-118
lines changed

4 files changed

+15
-118
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -180,7 +180,7 @@ struct amdgpu_mgpu_info mgpu_info = {
180180
};
181181
int amdgpu_ras_enable = -1;
182182
uint amdgpu_ras_mask = 0xffffffff;
183-
int amdgpu_bad_page_threshold = 100;
183+
int amdgpu_bad_page_threshold = -1;
184184
struct amdgpu_watchdog_timer amdgpu_watchdog_timer = {
185185
.timeout_fatal_disable = false,
186186
.period = 0x23, /* default to max. timeout = 1 << 0x23 cycles */
@@ -854,7 +854,7 @@ module_param_named(reset_method, amdgpu_reset_method, int, 0444);
854854
* faulty pages by ECC exceed threshold value and leave it for user's further
855855
* check.
856856
*/
857-
MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto, 0 = disable bad page retirement, 100 = default value");
857+
MODULE_PARM_DESC(bad_page_threshold, "Bad page threshold(-1 = auto(default value), 0 = disable bad page retirement)");
858858
module_param_named(bad_page_threshold, amdgpu_bad_page_threshold, int, 0444);
859859

860860
MODULE_PARM_DESC(num_kcq, "number of kernel compute queue user want to setup (8 if set to greater than 8 or less than 0, only affect gfx 8+)");

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 11 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -1790,14 +1790,13 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
17901790
return ret;
17911791
}
17921792

1793-
static uint32_t
1794-
amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev)
1793+
static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
1794+
uint32_t max_length)
17951795
{
1796+
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
17961797
int tmp_threshold = amdgpu_bad_page_threshold;
17971798
u64 val;
1798-
uint32_t max_length = 0;
17991799

1800-
max_length = amdgpu_ras_eeprom_get_record_max_length();
18011800
/*
18021801
* Justification of value bad_page_cnt_threshold in ras structure
18031802
*
@@ -1823,18 +1822,20 @@ amdgpu_ras_calculate_badpags_threshold(struct amdgpu_device *adev)
18231822
tmp_threshold = max_length;
18241823

18251824
if (tmp_threshold == -1) {
1826-
val = adev->gmc.real_vram_size;
1825+
val = adev->gmc.mc_vram_size;
18271826
do_div(val, RAS_BAD_PAGE_RATE);
1828-
tmp_threshold = min(lower_32_bits(val), max_length);
1827+
con->bad_page_cnt_threshold = min(lower_32_bits(val),
1828+
max_length);
1829+
} else {
1830+
con->bad_page_cnt_threshold = tmp_threshold;
18291831
}
1830-
1831-
return tmp_threshold;
18321832
}
18331833

18341834
int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
18351835
{
18361836
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
18371837
struct ras_err_handler_data **data;
1838+
uint32_t max_eeprom_records_len = 0;
18381839
bool exc_err_limit = false;
18391840
int ret;
18401841

@@ -1854,16 +1855,8 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
18541855
atomic_set(&con->in_recovery, 0);
18551856
con->adev = adev;
18561857

1857-
if (!con->bad_page_cnt_threshold) {
1858-
con->bad_page_cnt_threshold =
1859-
amdgpu_ras_calculate_badpags_threshold(adev);
1860-
1861-
ret = amdgpu_vram_mgr_reserve_backup_pages(
1862-
ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
1863-
con->bad_page_cnt_threshold);
1864-
if (ret)
1865-
goto out;
1866-
}
1858+
max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
1859+
amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
18671860

18681861
/* Todo: During test the SMU might fail to read the eeprom through I2C
18691862
* when the GPU is pending on XGMI reset during probe time

drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -48,8 +48,6 @@ struct amdgpu_vram_mgr {
4848
spinlock_t lock;
4949
struct list_head reservations_pending;
5050
struct list_head reserved_pages;
51-
struct list_head backup_pages;
52-
uint32_t num_backup_pages;
5351
atomic64_t usage;
5452
atomic64_t vis_usage;
5553
};
@@ -124,8 +122,6 @@ uint64_t amdgpu_vram_mgr_usage(struct ttm_resource_manager *man);
124122
uint64_t amdgpu_vram_mgr_vis_usage(struct ttm_resource_manager *man);
125123
int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
126124
uint64_t start, uint64_t size);
127-
int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man,
128-
uint32_t num_pages);
129125
int amdgpu_vram_mgr_query_page_status(struct ttm_resource_manager *man,
130126
uint64_t start);
131127

drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c

Lines changed: 2 additions & 94 deletions
Original file line number | Diff line number | Diff line change
@@ -28,9 +28,6 @@
2828
#include "amdgpu_atomfirmware.h"
2929
#include "atom.h"
3030

31-
static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr,
32-
uint32_t num_pages);
33-
3431
static inline struct amdgpu_vram_mgr *to_vram_mgr(struct ttm_resource_manager *man)
3532
{
3633
return container_of(man, struct amdgpu_vram_mgr, manager);
@@ -189,7 +186,6 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev)
189186
spin_lock_init(&mgr->lock);
190187
INIT_LIST_HEAD(&mgr->reservations_pending);
191188
INIT_LIST_HEAD(&mgr->reserved_pages);
192-
INIT_LIST_HEAD(&mgr->backup_pages);
193189

194190
/* Add the two VRAM-related sysfs files */
195191
ret = sysfs_create_files(&adev->dev->kobj, amdgpu_vram_mgr_attributes);
@@ -230,11 +226,6 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
230226
drm_mm_remove_node(&rsv->mm_node);
231227
kfree(rsv);
232228
}
233-
234-
list_for_each_entry_safe(rsv, temp, &mgr->backup_pages, node) {
235-
drm_mm_remove_node(&rsv->mm_node);
236-
kfree(rsv);
237-
}
238229
drm_mm_takedown(&mgr->mm);
239230
spin_unlock(&mgr->lock);
240231

@@ -306,14 +297,12 @@ static void amdgpu_vram_mgr_do_reserve(struct ttm_resource_manager *man)
306297
continue;
307298

308299
dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
309-
rsv->mm_node.start << PAGE_SHIFT, rsv->mm_node.size);
300+
rsv->mm_node.start, rsv->mm_node.size);
310301

311302
vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
312303
atomic64_add(vis_usage, &mgr->vis_usage);
313304
atomic64_add(rsv->mm_node.size << PAGE_SHIFT, &mgr->usage);
314305
list_move(&rsv->node, &mgr->reserved_pages);
315-
316-
amdgpu_vram_mgr_free_backup_pages(mgr, rsv->mm_node.size);
317306
}
318307
}
319308

@@ -330,7 +319,6 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
330319
uint64_t start, uint64_t size)
331320
{
332321
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
333-
struct amdgpu_device *adev = to_amdgpu_device(mgr);
334322
struct amdgpu_vram_reservation *rsv;
335323

336324
rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
@@ -341,94 +329,14 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
341329
rsv->mm_node.start = start >> PAGE_SHIFT;
342330
rsv->mm_node.size = size >> PAGE_SHIFT;
343331

344-
dev_dbg(adev->dev, "Pending Reservation: 0x%llx\n", start);
345-
346332
spin_lock(&mgr->lock);
347-
list_add_tail(&rsv->node, &mgr->reservations_pending);
333+
list_add_tail(&mgr->reservations_pending, &rsv->node);
348334
amdgpu_vram_mgr_do_reserve(man);
349335
spin_unlock(&mgr->lock);
350336

351337
return 0;
352338
}
353339

354-
static int amdgpu_vram_mgr_free_backup_pages(struct amdgpu_vram_mgr *mgr,
355-
uint32_t num_pages)
356-
{
357-
struct amdgpu_device *adev = to_amdgpu_device(mgr);
358-
struct amdgpu_vram_reservation *rsv;
359-
uint32_t i;
360-
uint64_t vis_usage = 0, total_usage = 0;
361-
362-
if (num_pages > mgr->num_backup_pages) {
363-
dev_warn(adev->dev, "No enough backup pages\n");
364-
return -EINVAL;
365-
}
366-
367-
for (i = 0; i < num_pages; i++) {
368-
rsv = list_first_entry(&mgr->backup_pages,
369-
struct amdgpu_vram_reservation, node);
370-
vis_usage += amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
371-
total_usage += (rsv->mm_node.size << PAGE_SHIFT);
372-
drm_mm_remove_node(&rsv->mm_node);
373-
list_del(&rsv->node);
374-
kfree(rsv);
375-
mgr->num_backup_pages--;
376-
}
377-
378-
atomic64_sub(total_usage, &mgr->usage);
379-
atomic64_sub(vis_usage, &mgr->vis_usage);
380-
381-
return 0;
382-
}
383-
384-
int amdgpu_vram_mgr_reserve_backup_pages(struct ttm_resource_manager *man,
385-
uint32_t num_pages)
386-
{
387-
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
388-
struct amdgpu_device *adev = to_amdgpu_device(mgr);
389-
struct amdgpu_vram_reservation *rsv;
390-
struct drm_mm *mm = &mgr->mm;
391-
uint32_t i;
392-
int ret = 0;
393-
uint64_t vis_usage, total_usage;
394-
395-
for (i = 0; i < num_pages; i++) {
396-
rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
397-
if (!rsv) {
398-
ret = -ENOMEM;
399-
goto pro_end;
400-
}
401-
402-
INIT_LIST_HEAD(&rsv->node);
403-
404-
ret = drm_mm_insert_node(mm, &rsv->mm_node, 1);
405-
if (ret) {
406-
dev_err(adev->dev, "failed to reserve backup page %d, ret 0x%x\n", i, ret);
407-
kfree(rsv);
408-
goto pro_end;
409-
}
410-
411-
vis_usage = amdgpu_vram_mgr_vis_size(adev, &rsv->mm_node);
412-
total_usage = (rsv->mm_node.size << PAGE_SHIFT);
413-
414-
spin_lock(&mgr->lock);
415-
atomic64_add(vis_usage, &mgr->vis_usage);
416-
atomic64_add(total_usage, &mgr->usage);
417-
list_add_tail(&rsv->node, &mgr->backup_pages);
418-
mgr->num_backup_pages++;
419-
spin_unlock(&mgr->lock);
420-
}
421-
422-
pro_end:
423-
if (ret) {
424-
spin_lock(&mgr->lock);
425-
amdgpu_vram_mgr_free_backup_pages(mgr, mgr->num_backup_pages);
426-
spin_unlock(&mgr->lock);
427-
}
428-
429-
return ret;
430-
}
431-
432340
/**
433341
* amdgpu_vram_mgr_query_page_status - query the reservation status
434342
*

0 commit comments

Comments
 (0)