Skip to content

Commit 5f2b6c5

Browse files
committed
Merge tag 'drm-fixes-2025-06-20' of https://gitlab.freedesktop.org/drm/kernel
Pull drm fixes from Dave Airlie: "Bit of an uptick in fixes for rc3, msm and amdgpu leading the way, with i915/xe/nouveau with a few each and then some scattered misc bits, nothing looks too crazy: msm: - Display: - Fixed DP output on SDM845 - Fixed 10nm DSI PLL init - GPU: - SUBMIT ioctl error path leak fixes - drm half of stall-on-fault fixes - a7xx: Missing CP_RESET_CONTEXT_STATE - Skip GPU component bind if GPU is not in the device table i915: - Fix MIPI vtotal programming off by one on Broxton - Fix PMU code for GCOV and AutoFDO enabled build xe: - A workaround update - Fix memset on iomem - Fix early wedge on GuC Load failure amdgpu: - DP tunneling fix - LTTPR fix - DSC fix - DML2.x ABGR16161616 fix - RMCM fix - Backlight fixes - GFX11 kicker support - SDMA reset fixes - VCN 5.0.1 fix - Reset fix - Misc small fixes amdkfd: - SDMA reset fix - Fix race in GWS scheduling nouveau: - update docs reference - fix backlight name buffer size - fix UAF in r535 gsp rpc msg - fix undefined shift mgag200: - drop export header ast: - drop export header malidp: - drop informational error ssd130x: - fix clear columns etnaviv: - scheduler locking fix v3d: - null pointer crash fix" * tag 'drm-fixes-2025-06-20' of https://gitlab.freedesktop.org/drm/kernel: (50 commits) drm/xe: Fix early wedge on GuC load failure drm/xe: Fix memset on iomem drm/xe/bmg: Update Wa_16023588340 drm/amdgpu/sdma5.2: init engine reset mutex drm/amdkfd: Fix race in GWS queue scheduling drm/amdgpu/sdma5: init engine reset mutex drm/amdgpu: switch job hw_fence to amdgpu_fence drm/amdgpu: Fix SDMA UTC_L1 handling during start/stop sequences drm/amdgpu: Release reset locks during failures drm/amd/display: Check dce_hwseq before dereferencing it drm/amdgpu: VCN v5_0_1 to prevent FW checking RB during DPG pause drm/amdgpu: Use logical instance ID for SDMA v4_4_2 queue operations drm/amdgpu: Fix SDMA engine reset with logical instance ID drm/amdgpu: add kicker fws loading for gfx11/smu13/psp13 drm/amdgpu: Add kicker device detection drm/amd/display: Export full brightness range to userspace drm/amd/display: Only read ACPI backlight caps once drm/amd/display: Fix RMCM programming seq errors drm/amd/display: Fix mpv playback corruption on weston drm/amd/display: Add more checks for DSC / HUBP ONO guarantees ...
2 parents 0fa5248 + b8de9b2 commit 5f2b6c5

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

72 files changed

+598
-200
lines changed

Documentation/gpu/nouveau.rst

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@ providing a consistent API to upper layers of the driver stack.
2525
GSP Support
2626
------------------------
2727

28-
.. kernel-doc:: drivers/gpu/drm/nouveau/nvkm/subdev/gsp/r535.c
28+
.. kernel-doc:: drivers/gpu/drm/nouveau/nvkm/subdev/gsp/rm/r535/rpc.c
2929
:doc: GSP message queue element
3030

3131
.. kernel-doc:: drivers/gpu/drm/nouveau/include/nvkm/subdev/gsp.h

drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1902,7 +1902,7 @@ static void amdgpu_ib_preempt_mark_partial_job(struct amdgpu_ring *ring)
19021902
continue;
19031903
}
19041904
job = to_amdgpu_job(s_job);
1905-
if (preempted && (&job->hw_fence) == fence)
1905+
if (preempted && (&job->hw_fence.base) == fence)
19061906
/* mark the job as preempted */
19071907
job->preemption_status |= AMDGPU_IB_PREEMPTED;
19081908
}

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 56 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -6019,16 +6019,12 @@ static int amdgpu_device_health_check(struct list_head *device_list_handle)
60196019
return ret;
60206020
}
60216021

6022-
static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
6023-
struct amdgpu_job *job,
6024-
struct amdgpu_reset_context *reset_context,
6025-
struct list_head *device_list,
6026-
struct amdgpu_hive_info *hive,
6027-
bool need_emergency_restart)
6022+
static int amdgpu_device_recovery_prepare(struct amdgpu_device *adev,
6023+
struct list_head *device_list,
6024+
struct amdgpu_hive_info *hive)
60286025
{
6029-
struct list_head *device_list_handle = NULL;
60306026
struct amdgpu_device *tmp_adev = NULL;
6031-
int i, r = 0;
6027+
int r;
60326028

60336029
/*
60346030
* Build list of devices to reset.
@@ -6045,26 +6041,54 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
60456041
}
60466042
if (!list_is_first(&adev->reset_list, device_list))
60476043
list_rotate_to_front(&adev->reset_list, device_list);
6048-
device_list_handle = device_list;
60496044
} else {
60506045
list_add_tail(&adev->reset_list, device_list);
6051-
device_list_handle = device_list;
60526046
}
60536047

60546048
if (!amdgpu_sriov_vf(adev) && (!adev->pcie_reset_ctx.occurs_dpc)) {
6055-
r = amdgpu_device_health_check(device_list_handle);
6049+
r = amdgpu_device_health_check(device_list);
60566050
if (r)
60576051
return r;
60586052
}
60596053

6060-
/* We need to lock reset domain only once both for XGMI and single device */
6061-
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
6062-
reset_list);
6054+
return 0;
6055+
}
6056+
6057+
static void amdgpu_device_recovery_get_reset_lock(struct amdgpu_device *adev,
6058+
struct list_head *device_list)
6059+
{
6060+
struct amdgpu_device *tmp_adev = NULL;
6061+
6062+
if (list_empty(device_list))
6063+
return;
6064+
tmp_adev =
6065+
list_first_entry(device_list, struct amdgpu_device, reset_list);
60636066
amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
6067+
}
60646068

6065-
/* block all schedulers and reset given job's ring */
6066-
list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
6069+
static void amdgpu_device_recovery_put_reset_lock(struct amdgpu_device *adev,
6070+
struct list_head *device_list)
6071+
{
6072+
struct amdgpu_device *tmp_adev = NULL;
60676073

6074+
if (list_empty(device_list))
6075+
return;
6076+
tmp_adev =
6077+
list_first_entry(device_list, struct amdgpu_device, reset_list);
6078+
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6079+
}
6080+
6081+
static int amdgpu_device_halt_activities(
6082+
struct amdgpu_device *adev, struct amdgpu_job *job,
6083+
struct amdgpu_reset_context *reset_context,
6084+
struct list_head *device_list, struct amdgpu_hive_info *hive,
6085+
bool need_emergency_restart)
6086+
{
6087+
struct amdgpu_device *tmp_adev = NULL;
6088+
int i, r = 0;
6089+
6090+
/* block all schedulers and reset given job's ring */
6091+
list_for_each_entry(tmp_adev, device_list, reset_list) {
60686092
amdgpu_device_set_mp1_state(tmp_adev);
60696093

60706094
/*
@@ -6252,11 +6276,6 @@ static void amdgpu_device_gpu_resume(struct amdgpu_device *adev,
62526276
amdgpu_ras_set_error_query_ready(tmp_adev, true);
62536277

62546278
}
6255-
6256-
tmp_adev = list_first_entry(device_list, struct amdgpu_device,
6257-
reset_list);
6258-
amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
6259-
62606279
}
62616280

62626281

@@ -6324,10 +6343,16 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
63246343
reset_context->hive = hive;
63256344
INIT_LIST_HEAD(&device_list);
63266345

6346+
if (amdgpu_device_recovery_prepare(adev, &device_list, hive))
6347+
goto end_reset;
6348+
6349+
/* We need to lock reset domain only once both for XGMI and single device */
6350+
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
6351+
63276352
r = amdgpu_device_halt_activities(adev, job, reset_context, &device_list,
63286353
hive, need_emergency_restart);
63296354
if (r)
6330-
goto end_reset;
6355+
goto reset_unlock;
63316356

63326357
if (need_emergency_restart)
63336358
goto skip_sched_resume;
@@ -6337,21 +6362,23 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
63376362
*
63386363
* job->base holds a reference to parent fence
63396364
*/
6340-
if (job && dma_fence_is_signaled(&job->hw_fence)) {
6365+
if (job && dma_fence_is_signaled(&job->hw_fence.base)) {
63416366
job_signaled = true;
63426367
dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
63436368
goto skip_hw_reset;
63446369
}
63456370

63466371
r = amdgpu_device_asic_reset(adev, &device_list, reset_context);
63476372
if (r)
6348-
goto end_reset;
6373+
goto reset_unlock;
63496374
skip_hw_reset:
63506375
r = amdgpu_device_sched_resume(&device_list, reset_context, job_signaled);
63516376
if (r)
6352-
goto end_reset;
6377+
goto reset_unlock;
63536378
skip_sched_resume:
63546379
amdgpu_device_gpu_resume(adev, &device_list, need_emergency_restart);
6380+
reset_unlock:
6381+
amdgpu_device_recovery_put_reset_lock(adev, &device_list);
63556382
end_reset:
63566383
if (hive) {
63576384
mutex_unlock(&hive->hive_lock);
@@ -6763,6 +6790,8 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
67636790
memset(&reset_context, 0, sizeof(reset_context));
67646791
INIT_LIST_HEAD(&device_list);
67656792

6793+
amdgpu_device_recovery_prepare(adev, &device_list, hive);
6794+
amdgpu_device_recovery_get_reset_lock(adev, &device_list);
67666795
r = amdgpu_device_halt_activities(adev, NULL, &reset_context, &device_list,
67676796
hive, false);
67686797
if (hive) {
@@ -6880,8 +6909,8 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
68806909
if (hive) {
68816910
list_for_each_entry(tmp_adev, &device_list, reset_list)
68826911
amdgpu_device_unset_mp1_state(tmp_adev);
6883-
amdgpu_device_unlock_reset_domain(adev->reset_domain);
68846912
}
6913+
amdgpu_device_recovery_put_reset_lock(adev, &device_list);
68856914
}
68866915

68876916
if (hive) {
@@ -6927,6 +6956,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
69276956

69286957
amdgpu_device_sched_resume(&device_list, NULL, NULL);
69296958
amdgpu_device_gpu_resume(adev, &device_list, false);
6959+
amdgpu_device_recovery_put_reset_lock(adev, &device_list);
69306960
adev->pcie_reset_ctx.occurs_dpc = false;
69316961

69326962
if (hive) {

drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c

Lines changed: 7 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -41,22 +41,6 @@
4141
#include "amdgpu_trace.h"
4242
#include "amdgpu_reset.h"
4343

44-
/*
45-
* Fences mark an event in the GPUs pipeline and are used
46-
* for GPU/CPU synchronization. When the fence is written,
47-
* it is expected that all buffers associated with that fence
48-
* are no longer in use by the associated ring on the GPU and
49-
* that the relevant GPU caches have been flushed.
50-
*/
51-
52-
struct amdgpu_fence {
53-
struct dma_fence base;
54-
55-
/* RB, DMA, etc. */
56-
struct amdgpu_ring *ring;
57-
ktime_t start_timestamp;
58-
};
59-
6044
static struct kmem_cache *amdgpu_fence_slab;
6145

6246
int amdgpu_fence_slab_init(void)
@@ -151,12 +135,12 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, struct amd
151135
am_fence = kmem_cache_alloc(amdgpu_fence_slab, GFP_ATOMIC);
152136
if (am_fence == NULL)
153137
return -ENOMEM;
154-
fence = &am_fence->base;
155-
am_fence->ring = ring;
156138
} else {
157139
/* take use of job-embedded fence */
158-
fence = &job->hw_fence;
140+
am_fence = &job->hw_fence;
159141
}
142+
fence = &am_fence->base;
143+
am_fence->ring = ring;
160144

161145
seq = ++ring->fence_drv.sync_seq;
162146
if (job && job->job_run_counter) {
@@ -718,7 +702,7 @@ void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring)
718702
* it right here or we won't be able to track them in fence_drv
719703
* and they will remain unsignaled during sa_bo free.
720704
*/
721-
job = container_of(old, struct amdgpu_job, hw_fence);
705+
job = container_of(old, struct amdgpu_job, hw_fence.base);
722706
if (!job->base.s_fence && !dma_fence_is_signaled(old))
723707
dma_fence_signal(old);
724708
RCU_INIT_POINTER(*ptr, NULL);
@@ -780,7 +764,7 @@ static const char *amdgpu_fence_get_timeline_name(struct dma_fence *f)
780764

781765
static const char *amdgpu_job_fence_get_timeline_name(struct dma_fence *f)
782766
{
783-
struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
767+
struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence.base);
784768

785769
return (const char *)to_amdgpu_ring(job->base.sched)->name;
786770
}
@@ -810,7 +794,7 @@ static bool amdgpu_fence_enable_signaling(struct dma_fence *f)
810794
*/
811795
static bool amdgpu_job_fence_enable_signaling(struct dma_fence *f)
812796
{
813-
struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence);
797+
struct amdgpu_job *job = container_of(f, struct amdgpu_job, hw_fence.base);
814798

815799
if (!timer_pending(&to_amdgpu_ring(job->base.sched)->fence_drv.fallback_timer))
816800
amdgpu_fence_schedule_fallback(to_amdgpu_ring(job->base.sched));
@@ -845,7 +829,7 @@ static void amdgpu_job_fence_free(struct rcu_head *rcu)
845829
struct dma_fence *f = container_of(rcu, struct dma_fence, rcu);
846830

847831
/* free job if fence has a parent job */
848-
kfree(container_of(f, struct amdgpu_job, hw_fence));
832+
kfree(container_of(f, struct amdgpu_job, hw_fence.base));
849833
}
850834

851835
/**

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -272,8 +272,8 @@ void amdgpu_job_free_resources(struct amdgpu_job *job)
272272
/* Check if any fences where initialized */
273273
if (job->base.s_fence && job->base.s_fence->finished.ops)
274274
f = &job->base.s_fence->finished;
275-
else if (job->hw_fence.ops)
276-
f = &job->hw_fence;
275+
else if (job->hw_fence.base.ops)
276+
f = &job->hw_fence.base;
277277
else
278278
f = NULL;
279279

@@ -290,10 +290,10 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
290290
amdgpu_sync_free(&job->explicit_sync);
291291

292292
/* only put the hw fence if has embedded fence */
293-
if (!job->hw_fence.ops)
293+
if (!job->hw_fence.base.ops)
294294
kfree(job);
295295
else
296-
dma_fence_put(&job->hw_fence);
296+
dma_fence_put(&job->hw_fence.base);
297297
}
298298

299299
void amdgpu_job_set_gang_leader(struct amdgpu_job *job,
@@ -322,10 +322,10 @@ void amdgpu_job_free(struct amdgpu_job *job)
322322
if (job->gang_submit != &job->base.s_fence->scheduled)
323323
dma_fence_put(job->gang_submit);
324324

325-
if (!job->hw_fence.ops)
325+
if (!job->hw_fence.base.ops)
326326
kfree(job);
327327
else
328-
dma_fence_put(&job->hw_fence);
328+
dma_fence_put(&job->hw_fence.base);
329329
}
330330

331331
struct dma_fence *amdgpu_job_submit(struct amdgpu_job *job)

drivers/gpu/drm/amd/amdgpu/amdgpu_job.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -48,7 +48,7 @@ struct amdgpu_job {
4848
struct drm_sched_job base;
4949
struct amdgpu_vm *vm;
5050
struct amdgpu_sync explicit_sync;
51-
struct dma_fence hw_fence;
51+
struct amdgpu_fence hw_fence;
5252
struct dma_fence *gang_submit;
5353
uint32_t preamble_status;
5454
uint32_t preemption_status;

drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c

Lines changed: 12 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3522,8 +3522,12 @@ int psp_init_sos_microcode(struct psp_context *psp, const char *chip_name)
35223522
uint8_t *ucode_array_start_addr;
35233523
int err = 0;
35243524

3525-
err = amdgpu_ucode_request(adev, &adev->psp.sos_fw, AMDGPU_UCODE_REQUIRED,
3526-
"amdgpu/%s_sos.bin", chip_name);
3525+
if (amdgpu_is_kicker_fw(adev))
3526+
err = amdgpu_ucode_request(adev, &adev->psp.sos_fw, AMDGPU_UCODE_REQUIRED,
3527+
"amdgpu/%s_sos_kicker.bin", chip_name);
3528+
else
3529+
err = amdgpu_ucode_request(adev, &adev->psp.sos_fw, AMDGPU_UCODE_REQUIRED,
3530+
"amdgpu/%s_sos.bin", chip_name);
35273531
if (err)
35283532
goto out;
35293533

@@ -3799,8 +3803,12 @@ int psp_init_ta_microcode(struct psp_context *psp, const char *chip_name)
37993803
struct amdgpu_device *adev = psp->adev;
38003804
int err;
38013805

3802-
err = amdgpu_ucode_request(adev, &adev->psp.ta_fw, AMDGPU_UCODE_REQUIRED,
3803-
"amdgpu/%s_ta.bin", chip_name);
3806+
if (amdgpu_is_kicker_fw(adev))
3807+
err = amdgpu_ucode_request(adev, &adev->psp.ta_fw, AMDGPU_UCODE_REQUIRED,
3808+
"amdgpu/%s_ta_kicker.bin", chip_name);
3809+
else
3810+
err = amdgpu_ucode_request(adev, &adev->psp.ta_fw, AMDGPU_UCODE_REQUIRED,
3811+
"amdgpu/%s_ta.bin", chip_name);
38043812
if (err)
38053813
return err;
38063814

drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -127,6 +127,22 @@ struct amdgpu_fence_driver {
127127
struct dma_fence **fences;
128128
};
129129

130+
/*
131+
* Fences mark an event in the GPUs pipeline and are used
132+
* for GPU/CPU synchronization. When the fence is written,
133+
* it is expected that all buffers associated with that fence
134+
* are no longer in use by the associated ring on the GPU and
135+
* that the relevant GPU caches have been flushed.
136+
*/
137+
138+
struct amdgpu_fence {
139+
struct dma_fence base;
140+
141+
/* RB, DMA, etc. */
142+
struct amdgpu_ring *ring;
143+
ktime_t start_timestamp;
144+
};
145+
130146
extern const struct drm_sched_backend_ops amdgpu_sched_ops;
131147

132148
void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring);

drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -540,8 +540,10 @@ static int amdgpu_sdma_soft_reset(struct amdgpu_device *adev, u32 instance_id)
540540
case IP_VERSION(4, 4, 2):
541541
case IP_VERSION(4, 4, 4):
542542
case IP_VERSION(4, 4, 5):
543-
/* For SDMA 4.x, use the existing DPM interface for backward compatibility */
544-
r = amdgpu_dpm_reset_sdma(adev, 1 << instance_id);
543+
/* For SDMA 4.x, use the existing DPM interface for backward compatibility,
544+
* we need to convert the logical instance ID to physical instance ID before reset.
545+
*/
546+
r = amdgpu_dpm_reset_sdma(adev, 1 << GET_INST(SDMA0, instance_id));
545547
break;
546548
case IP_VERSION(5, 0, 0):
547549
case IP_VERSION(5, 0, 1):
@@ -568,7 +570,7 @@ static int amdgpu_sdma_soft_reset(struct amdgpu_device *adev, u32 instance_id)
568570
/**
569571
* amdgpu_sdma_reset_engine - Reset a specific SDMA engine
570572
* @adev: Pointer to the AMDGPU device
571-
* @instance_id: ID of the SDMA engine instance to reset
573+
* @instance_id: Logical ID of the SDMA engine instance to reset
572574
*
573575
* Returns: 0 on success, or a negative error code on failure.
574576
*/
@@ -601,7 +603,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
601603
/* Perform the SDMA reset for the specified instance */
602604
ret = amdgpu_sdma_soft_reset(adev, instance_id);
603605
if (ret) {
604-
dev_err(adev->dev, "Failed to reset SDMA instance %u\n", instance_id);
606+
dev_err(adev->dev, "Failed to reset SDMA logical instance %u\n", instance_id);
605607
goto exit;
606608
}
607609

0 commit comments

Comments (0)