Skip to content

Commit e02fcf7

Browse files
committed
drm/amdgpu/sdma: fix engine reset handling
Move the KFD suspend/resume code into the caller. That is where the KFD is likely to detect a reset, so on the KFD side there is no need to call them. Also add a mutex to lock the actual reset sequence.

v2: make the locking per instance

Fixes: bac38ca ("drm/amdkfd: implement per queue sdma reset for gfx 9.4+")
Reviewed-by: Jesse Zhang <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent fc70d1e commit e02fcf7

File tree

4 files changed

+13
-15
lines changed

4 files changed

+13
-15
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -532,7 +532,6 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct
532532
* amdgpu_sdma_reset_engine - Reset a specific SDMA engine
533533
* @adev: Pointer to the AMDGPU device
534534
* @instance_id: ID of the SDMA engine instance to reset
535-
* @suspend_user_queues: check if suspend user queue.
536535
*
537536
* This function performs the following steps:
538537
* 1. Calls all registered pre_reset callbacks to allow KFD and AMDGPU to save their state.
@@ -541,7 +540,7 @@ void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct
541540
*
542541
* Returns: 0 on success, or a negative error code on failure.
543542
*/
544-
int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, bool suspend_user_queues)
543+
int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
545544
{
546545
struct sdma_on_reset_funcs *funcs;
547546
int ret = 0;
@@ -550,13 +549,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, b
550549
struct amdgpu_ring *page_ring = &sdma_instance->page;
551550
bool gfx_sched_stopped = false, page_sched_stopped = false;
552551

553-
/* Suspend KFD if suspend_user_queues is true.
554-
* prevent the destruction of in-flight healthy user queue packets and
555-
* avoid race conditions between KFD and KGD during the reset process.
556-
*/
557-
if (suspend_user_queues)
558-
amdgpu_amdkfd_suspend(adev, false);
559-
552+
mutex_lock(&sdma_instance->engine_reset_mutex);
560553
/* Stop the scheduler's work queue for the GFX and page rings if they are running.
561554
* This ensures that no new tasks are submitted to the queues while
562555
* the reset is in progress.
@@ -617,9 +610,7 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, b
617610
drm_sched_wqueue_start(&page_ring->sched);
618611
}
619612
}
620-
621-
if (suspend_user_queues)
622-
amdgpu_amdkfd_resume(adev, false);
613+
mutex_unlock(&sdma_instance->engine_reset_mutex);
623614

624615
return ret;
625616
}

drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ struct amdgpu_sdma_instance {
6464
struct amdgpu_bo *sdma_fw_obj;
6565
uint64_t sdma_fw_gpu_addr;
6666
uint32_t *sdma_fw_ptr;
67+
struct mutex engine_reset_mutex;
6768
};
6869

6970
enum amdgpu_sdma_ras_memory_id {
@@ -169,7 +170,7 @@ struct amdgpu_buffer_funcs {
169170
};
170171

171172
void amdgpu_sdma_register_on_reset_callbacks(struct amdgpu_device *adev, struct sdma_on_reset_funcs *funcs);
172-
int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id, bool suspend_user_queues);
173+
int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id);
173174

174175
#define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) (adev)->mman.buffer_funcs->emit_copy_buffer((ib), (s), (d), (b), (t))
175176
#define amdgpu_emit_fill_buffer(adev, ib, s, d, b) (adev)->mman.buffer_funcs->emit_fill_buffer((ib), (s), (d), (b))

drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1445,6 +1445,7 @@ static int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
14451445
}
14461446

14471447
for (i = 0; i < adev->sdma.num_instances; i++) {
1448+
mutex_init(&adev->sdma.instance[i].engine_reset_mutex);
14481449
ring = &adev->sdma.instance[i].ring;
14491450
ring->ring_obj = NULL;
14501451
ring->use_doorbell = true;
@@ -1666,11 +1667,16 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring, unsigned int vmid)
16661667
{
16671668
struct amdgpu_device *adev = ring->adev;
16681669
u32 id = GET_INST(SDMA0, ring->me);
1670+
int r;
16691671

16701672
if (!(adev->sdma.supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE))
16711673
return -EOPNOTSUPP;
16721674

1673-
return amdgpu_sdma_reset_engine(adev, id, true);
1675+
amdgpu_amdkfd_suspend(adev, false);
1676+
r = amdgpu_sdma_reset_engine(adev, id);
1677+
amdgpu_amdkfd_resume(adev, false);
1678+
1679+
return r;
16741680
}
16751681

16761682
static int sdma_v4_4_2_stop_queue(struct amdgpu_device *adev, uint32_t instance_id)

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2310,7 +2310,7 @@ static int reset_hung_queues_sdma(struct device_queue_manager *dqm)
23102310
continue;
23112311

23122312
/* Reset engine and check. */
2313-
if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) ||
2313+
if (amdgpu_sdma_reset_engine(dqm->dev->adev, i) ||
23142314
dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) ||
23152315
!set_sdma_queue_as_reset(dqm, doorbell_off)) {
23162316
r = -ENOTRECOVERABLE;

0 commit comments

Comments
 (0)