Skip to content

Commit 741d73f

Browse files
committed
Merge tag 'amd-drm-next-6.12-2024-09-06' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.12-2024-09-06:

amdgpu:
- IPS updates
- Post divider fix
- DML2 updates
- Misc static checker fixes
- DCN 3.5 fixes
- Replay fixes
- DMCUB updates
- SWSMU fixes
- DP MST fixes
- Add debug flag for per queue resets
- devcoredump updates
- SR-IOV fixes
- MES fixes
- Always allocate cleared VRAM for GEM
- Pipe reset for GC 9.4.3
- ODM policy fixes
- Per queue reset support for GC 10
- Per queue reset support for GC 11
- Per queue reset support for GC 12
- Display flickering fixes
- MPO fixes
- Display sharpening updates

amdkfd:
- SVM fix for IH for APUs

Signed-off-by: Dave Airlie <[email protected]>
From: Alex Deucher <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
2 parents 32bd3eb + 7a19955 commit 741d73f

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+6024
-1359
lines changed

Documentation/gpu/amdgpu/driver-core.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,4 +179,4 @@ IP Blocks
179179
:doc: IP Blocks
180180

181181
.. kernel-doc:: drivers/gpu/drm/amd/include/amd_shared.h
182-
:identifiers: amd_ip_block_type amd_ip_funcs
182+
:identifiers: amd_ip_block_type amd_ip_funcs DC_DEBUG_MASK

drivers/gpu/drm/amd/amdgpu/amdgpu.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1164,6 +1164,7 @@ struct amdgpu_device {
11641164
bool debug_disable_soft_recovery;
11651165
bool debug_use_vram_fw_buf;
11661166
bool debug_enable_ras_aca;
1167+
bool debug_exp_resets;
11671168

11681169
bool enforce_isolation[MAX_XCP];
11691170
/* Added this mutex for cleaner shader isolation between GFX and compute processes */

drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1151,6 +1151,10 @@ uint64_t kgd_gfx_v9_hqd_get_pq_addr(struct amdgpu_device *adev,
11511151
uint32_t low, high;
11521152
uint64_t queue_addr = 0;
11531153

1154+
if (!adev->debug_exp_resets &&
1155+
!adev->gfx.num_gfx_rings)
1156+
return 0;
1157+
11541158
kgd_gfx_v9_acquire_queue(adev, pipe_id, queue_id, inst);
11551159
amdgpu_gfx_rlc_enter_safe_mode(adev, inst);
11561160

drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.c

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,8 @@
2828
#include "atom.h"
2929

3030
#ifndef CONFIG_DEV_COREDUMP
31-
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
32-
struct amdgpu_reset_context *reset_context)
31+
void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
32+
bool vram_lost, struct amdgpu_job *job)
3333
{
3434
}
3535
#else
@@ -315,7 +315,9 @@ amdgpu_devcoredump_read(char *buffer, loff_t offset, size_t count,
315315
}
316316
}
317317

318-
if (coredump->reset_vram_lost)
318+
if (coredump->skip_vram_check)
319+
drm_printf(&p, "VRAM lost check is skipped!\n");
320+
else if (coredump->reset_vram_lost)
319321
drm_printf(&p, "VRAM is lost due to GPU reset!\n");
320322

321323
return count - iter.remain;
@@ -326,12 +328,11 @@ static void amdgpu_devcoredump_free(void *data)
326328
kfree(data);
327329
}
328330

329-
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
330-
struct amdgpu_reset_context *reset_context)
331+
void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
332+
bool vram_lost, struct amdgpu_job *job)
331333
{
332-
struct amdgpu_coredump_info *coredump;
333334
struct drm_device *dev = adev_to_drm(adev);
334-
struct amdgpu_job *job = reset_context->job;
335+
struct amdgpu_coredump_info *coredump;
335336
struct drm_sched_job *s_job;
336337

337338
coredump = kzalloc(sizeof(*coredump), GFP_NOWAIT);
@@ -341,11 +342,12 @@ void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
341342
return;
342343
}
343344

345+
coredump->skip_vram_check = skip_vram_check;
344346
coredump->reset_vram_lost = vram_lost;
345347

346-
if (reset_context->job && reset_context->job->vm) {
348+
if (job && job->vm) {
349+
struct amdgpu_vm *vm = job->vm;
347350
struct amdgpu_task_info *ti;
348-
struct amdgpu_vm *vm = reset_context->job->vm;
349351

350352
ti = amdgpu_vm_get_task_info_vm(vm);
351353
if (ti) {

drivers/gpu/drm/amd/amdgpu/amdgpu_dev_coredump.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
#define __AMDGPU_DEV_COREDUMP_H__
2727

2828
#include "amdgpu.h"
29-
#include "amdgpu_reset.h"
3029

3130
#ifdef CONFIG_DEV_COREDUMP
3231

@@ -36,12 +35,12 @@ struct amdgpu_coredump_info {
3635
struct amdgpu_device *adev;
3736
struct amdgpu_task_info reset_task_info;
3837
struct timespec64 reset_time;
38+
bool skip_vram_check;
3939
bool reset_vram_lost;
4040
struct amdgpu_ring *ring;
4141
};
4242
#endif
4343

44-
void amdgpu_coredump(struct amdgpu_device *adev, bool vram_lost,
45-
struct amdgpu_reset_context *reset_context);
46-
44+
void amdgpu_coredump(struct amdgpu_device *adev, bool skip_vram_check,
45+
bool vram_lost, struct amdgpu_job *job);
4746
#endif

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4531,6 +4531,9 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
45314531
{
45324532
dev_info(adev->dev, "amdgpu: finishing device.\n");
45334533
flush_delayed_work(&adev->delayed_init_work);
4534+
4535+
if (adev->mman.initialized)
4536+
drain_workqueue(adev->mman.bdev.wq);
45344537
adev->shutdown = true;
45354538

45364539
/* make sure IB test finished before entering exclusive mode
@@ -4551,9 +4554,6 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
45514554
}
45524555
amdgpu_fence_driver_hw_fini(adev);
45534556

4554-
if (adev->mman.initialized)
4555-
drain_workqueue(adev->mman.bdev.wq);
4556-
45574557
if (adev->pm.sysfs_initialized)
45584558
amdgpu_pm_sysfs_fini(adev);
45594559
if (adev->ucode_sysfs_en)
@@ -5489,7 +5489,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
54895489
vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
54905490

54915491
if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
5492-
amdgpu_coredump(tmp_adev, vram_lost, reset_context);
5492+
amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
54935493

54945494
if (vram_lost) {
54955495
DRM_INFO("VRAM is lost due to GPU reset!\n");

drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ enum AMDGPU_DEBUG_MASK {
131131
AMDGPU_DEBUG_DISABLE_GPU_SOFT_RECOVERY = BIT(2),
132132
AMDGPU_DEBUG_USE_VRAM_FW_BUF = BIT(3),
133133
AMDGPU_DEBUG_ENABLE_RAS_ACA = BIT(4),
134+
AMDGPU_DEBUG_ENABLE_EXP_RESETS = BIT(5),
134135
};
135136

136137
unsigned int amdgpu_vram_limit = UINT_MAX;
@@ -2199,6 +2200,11 @@ static void amdgpu_init_debug_options(struct amdgpu_device *adev)
21992200
pr_info("debug: enable RAS ACA\n");
22002201
adev->debug_enable_ras_aca = true;
22012202
}
2203+
2204+
if (amdgpu_debug_mask & AMDGPU_DEBUG_ENABLE_EXP_RESETS) {
2205+
pr_info("debug: enable experimental reset features\n");
2206+
adev->debug_exp_resets = true;
2207+
}
22022208
}
22032209

22042210
static unsigned long amdgpu_fix_asic_type(struct pci_dev *pdev, unsigned long flags)

drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,9 @@ int amdgpu_gem_create_ioctl(struct drm_device *dev, void *data,
348348
return -EINVAL;
349349
}
350350

351+
/* always clear VRAM */
352+
flags |= AMDGPU_GEM_CREATE_VRAM_CLEARED;
353+
351354
/* create a gem object to contain this object in */
352355
if (args->in.domains & (AMDGPU_GEM_DOMAIN_GDS |
353356
AMDGPU_GEM_DOMAIN_GWS | AMDGPU_GEM_DOMAIN_OA)) {

drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -660,7 +660,7 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id)
660660
uint64_t queue_mask = 0;
661661
int r, i, j;
662662

663-
if (adev->enable_mes)
663+
if (adev->mes.enable_legacy_queue_map)
664664
return amdgpu_gfx_mes_enable_kcq(adev, xcc_id);
665665

666666
if (!kiq->pmf || !kiq->pmf->kiq_map_queues || !kiq->pmf->kiq_set_resources)
@@ -722,7 +722,7 @@ int amdgpu_gfx_enable_kgq(struct amdgpu_device *adev, int xcc_id)
722722

723723
amdgpu_device_flush_hdp(adev, NULL);
724724

725-
if (adev->enable_mes) {
725+
if (adev->mes.enable_legacy_queue_map) {
726726
for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
727727
j = i + xcc_id * adev->gfx.num_gfx_rings;
728728
r = amdgpu_mes_map_legacy_queue(adev,

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,60 @@
3030
#include "amdgpu.h"
3131
#include "amdgpu_trace.h"
3232
#include "amdgpu_reset.h"
33+
#include "amdgpu_dev_coredump.h"
34+
#include "amdgpu_xgmi.h"
35+
36+
static void amdgpu_job_do_core_dump(struct amdgpu_device *adev,
37+
struct amdgpu_job *job)
38+
{
39+
int i;
40+
41+
dev_info(adev->dev, "Dumping IP State\n");
42+
for (i = 0; i < adev->num_ip_blocks; i++)
43+
if (adev->ip_blocks[i].version->funcs->dump_ip_state)
44+
adev->ip_blocks[i].version->funcs
45+
->dump_ip_state((void *)adev);
46+
dev_info(adev->dev, "Dumping IP State Completed\n");
47+
48+
amdgpu_coredump(adev, true, false, job);
49+
}
50+
51+
static void amdgpu_job_core_dump(struct amdgpu_device *adev,
52+
struct amdgpu_job *job)
53+
{
54+
struct list_head device_list, *device_list_handle = NULL;
55+
struct amdgpu_device *tmp_adev = NULL;
56+
struct amdgpu_hive_info *hive = NULL;
57+
58+
if (!amdgpu_sriov_vf(adev))
59+
hive = amdgpu_get_xgmi_hive(adev);
60+
if (hive)
61+
mutex_lock(&hive->hive_lock);
62+
/*
63+
* Reuse the logic in amdgpu_device_gpu_recover() to build list of
64+
* devices for core dump
65+
*/
66+
INIT_LIST_HEAD(&device_list);
67+
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) {
68+
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
69+
list_add_tail(&tmp_adev->reset_list, &device_list);
70+
if (!list_is_first(&adev->reset_list, &device_list))
71+
list_rotate_to_front(&adev->reset_list, &device_list);
72+
device_list_handle = &device_list;
73+
} else {
74+
list_add_tail(&adev->reset_list, &device_list);
75+
device_list_handle = &device_list;
76+
}
77+
78+
/* Do the coredump for each device */
79+
list_for_each_entry(tmp_adev, device_list_handle, reset_list)
80+
amdgpu_job_do_core_dump(tmp_adev, job);
81+
82+
if (hive) {
83+
mutex_unlock(&hive->hive_lock);
84+
amdgpu_put_xgmi_hive(hive);
85+
}
86+
}
3387

3488
static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
3589
{
@@ -48,9 +102,14 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
48102
return DRM_GPU_SCHED_STAT_ENODEV;
49103
}
50104

51-
52105
adev->job_hang = true;
53106

107+
/*
108+
* Do the coredump immediately after a job timeout to get a very
109+
* close dump/snapshot/representation of GPU's current error status
110+
*/
111+
amdgpu_job_core_dump(adev, job);
112+
54113
if (amdgpu_gpu_recovery &&
55114
amdgpu_ring_soft_recovery(ring, job->vmid, s_job->s_fence->parent)) {
56115
dev_err(adev->dev, "ring %s timeout, but soft recovered\n",
@@ -101,6 +160,12 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
101160
reset_context.src = AMDGPU_RESET_SRC_JOB;
102161
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
103162

163+
/*
164+
* To avoid an unnecessary extra coredump, as we have already
165+
* got the very close representation of GPU's error status
166+
*/
167+
set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);
168+
104169
r = amdgpu_device_gpu_recover(ring->adev, job, &reset_context);
105170
if (r)
106171
dev_err(adev->dev, "GPU Recovery Failed: %d\n", r);

0 commit comments

Comments
 (0)