Skip to content

Commit 3bae791

Browse files
committed
drm/amdgpu/sdma: guilty tracking is per instance
The gfx and page queues are per instance, so track them per instance.

v2: drop extra parameter (Lijo)

Fixes: fdbfaaa ("drm/amdgpu: Improve SDMA reset logic with guilty queue tracking")
Reviewed-by: Lijo Lazar <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent e02fcf7 commit 3bae791

File tree

2 files changed

+18
-16
lines changed

2 files changed

+18
-16
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,10 @@ struct amdgpu_sdma_instance {
6565
uint64_t sdma_fw_gpu_addr;
6666
uint32_t *sdma_fw_ptr;
6767
struct mutex engine_reset_mutex;
68+
/* track guilty state of GFX and PAGE queues */
69+
bool gfx_guilty;
70+
bool page_guilty;
71+
6872
};
6973

7074
enum amdgpu_sdma_ras_memory_id {
@@ -127,9 +131,6 @@ struct amdgpu_sdma {
127131
uint32_t *ip_dump;
128132
uint32_t supported_reset;
129133
struct list_head reset_callback_list;
130-
/* track guilty state of GFX and PAGE queues */
131-
bool gfx_guilty;
132-
bool page_guilty;
133134
};
134135

135136
/*

drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -672,12 +672,11 @@ static uint32_t sdma_v4_4_2_rb_cntl(struct amdgpu_ring *ring, uint32_t rb_cntl)
672672
* @adev: amdgpu_device pointer
673673
* @i: instance to resume
674674
* @restore: used to restore wptr when restart
675-
* @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
676675
*
677676
* Set up the gfx DMA ring buffers and enable them.
678677
* Returns 0 for success, error for failure.
679678
*/
680-
static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
679+
static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
681680
{
682681
struct amdgpu_ring *ring = &adev->sdma.instance[i].ring;
683682
u32 rb_cntl, ib_cntl, wptr_poll_cntl;
@@ -714,7 +713,7 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
714713
/* For the guilty queue, set RPTR to the current wptr to skip bad commands,
715714
* If it is not a guilty queue, restore cache_rptr and continue execution.
716715
*/
717-
if (guilty)
716+
if (adev->sdma.instance[i].gfx_guilty)
718717
rwptr = ring->wptr;
719718
else
720719
rwptr = ring->cached_rptr;
@@ -779,12 +778,11 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device *adev, unsigned int i, b
779778
* @adev: amdgpu_device pointer
780779
* @i: instance to resume
781780
* @restore: boolean to say restore needed or not
782-
* @guilty: boolean indicating whether this queue is the guilty one (caused the timeout/error)
783781
*
784782
* Set up the page DMA ring buffers and enable them.
785783
* Returns 0 for success, error for failure.
786784
*/
787-
static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore, bool guilty)
785+
static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i, bool restore)
788786
{
789787
struct amdgpu_ring *ring = &adev->sdma.instance[i].page;
790788
u32 rb_cntl, ib_cntl, wptr_poll_cntl;
@@ -803,7 +801,7 @@ static void sdma_v4_4_2_page_resume(struct amdgpu_device *adev, unsigned int i,
803801
/* For the guilty queue, set RPTR to the current wptr to skip bad commands,
804802
* If it is not a guilty queue, restore cache_rptr and continue execution.
805803
*/
806-
if (guilty)
804+
if (adev->sdma.instance[i].page_guilty)
807805
rwptr = ring->wptr;
808806
else
809807
rwptr = ring->cached_rptr;
@@ -989,9 +987,9 @@ static int sdma_v4_4_2_inst_start(struct amdgpu_device *adev,
989987
uint32_t temp;
990988

991989
WREG32_SDMA(i, regSDMA_SEM_WAIT_FAIL_TIMER_CNTL, 0);
992-
sdma_v4_4_2_gfx_resume(adev, i, restore, adev->sdma.gfx_guilty);
990+
sdma_v4_4_2_gfx_resume(adev, i, restore);
993991
if (adev->sdma.has_page_queue)
994-
sdma_v4_4_2_page_resume(adev, i, restore, adev->sdma.page_guilty);
992+
sdma_v4_4_2_page_resume(adev, i, restore);
995993

996994
/* set utc l1 enable flag always to 1 */
997995
temp = RREG32_SDMA(i, regSDMA_CNTL);
@@ -1446,6 +1444,10 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
14461444

14471445
for (i = 0; i < adev->sdma.num_instances; i++) {
14481446
mutex_init(&adev->sdma.instance[i].engine_reset_mutex);
1447+
/* Initialize guilty flags for GFX and PAGE queues */
1448+
adev->sdma.instance[i].gfx_guilty = false;
1449+
adev->sdma.instance[i].page_guilty = false;
1450+
14491451
ring = &adev->sdma.instance[i].ring;
14501452
ring->ring_obj = NULL;
14511453
ring->use_doorbell = true;
@@ -1507,9 +1509,6 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
15071509
r = amdgpu_sdma_sysfs_reset_mask_init(adev);
15081510
if (r)
15091511
return r;
1510-
/* Initialize guilty flags for GFX and PAGE queues */
1511-
adev->sdma.gfx_guilty = false;
1512-
adev->sdma.page_guilty = false;
15131512

15141513
return r;
15151514
}
@@ -1689,9 +1688,11 @@ static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_
16891688
return -EINVAL;
16901689

16911690
/* Check if this queue is the guilty one */
1692-
adev->sdma.gfx_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
1691+
adev->sdma.instance[instance_id].gfx_guilty =
1692+
sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
16931693
if (adev->sdma.has_page_queue)
1694-
adev->sdma.page_guilty = sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
1694+
adev->sdma.instance[instance_id].page_guilty =
1695+
sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
16951696

16961697
/* Cache the rptr before reset; after the reset,
16971698
* all of the registers will be reset to 0

0 commit comments

Comments
 (0)