Skip to content

Commit 0c3c2e3

Browse files
committed
drm/amdgpu/sdma: allow caller to handle kernel rings in engine reset
Add a parameter to amdgpu_sdma_reset_engine() to let the caller handle the kernel rings. This allows the kernel rings to back up their unprocessed state if the reset comes in via the DRM scheduler rather than KFD.

Reviewed-by: Jesse Zhang <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent f8410a1 commit 0c3c2e3

File tree

6 files changed

+32
-23
lines changed

6 files changed

+32
-23
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -545,25 +545,31 @@ static int amdgpu_sdma_soft_reset(struct amdgpu_device *adev, u32 instance_id)
545545
* amdgpu_sdma_reset_engine - Reset a specific SDMA engine
546546
* @adev: Pointer to the AMDGPU device
547547
* @instance_id: Logical ID of the SDMA engine instance to reset
548+
* @caller_handles_kernel_queues: Skip kernel queue processing. Caller
549+
* will handle it.
548550
*
549551
* Returns: 0 on success, or a negative error code on failure.
550552
*/
551-
int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
553+
int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id,
554+
bool caller_handles_kernel_queues)
552555
{
553556
int ret = 0;
554557
struct amdgpu_sdma_instance *sdma_instance = &adev->sdma.instance[instance_id];
555558
struct amdgpu_ring *gfx_ring = &sdma_instance->ring;
556559
struct amdgpu_ring *page_ring = &sdma_instance->page;
557560

558561
mutex_lock(&sdma_instance->engine_reset_mutex);
559-
/* Stop the scheduler's work queue for the GFX and page rings if they are running.
560-
* This ensures that no new tasks are submitted to the queues while
561-
* the reset is in progress.
562-
*/
563-
drm_sched_wqueue_stop(&gfx_ring->sched);
564562

565-
if (adev->sdma.has_page_queue)
566-
drm_sched_wqueue_stop(&page_ring->sched);
563+
if (!caller_handles_kernel_queues) {
564+
/* Stop the scheduler's work queue for the GFX and page rings if they are running.
565+
* This ensures that no new tasks are submitted to the queues while
566+
* the reset is in progress.
567+
*/
568+
drm_sched_wqueue_stop(&gfx_ring->sched);
569+
570+
if (adev->sdma.has_page_queue)
571+
drm_sched_wqueue_stop(&page_ring->sched);
572+
}
567573

568574
if (sdma_instance->funcs->stop_kernel_queue) {
569575
sdma_instance->funcs->stop_kernel_queue(gfx_ring);
@@ -585,16 +591,18 @@ int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id)
585591
}
586592

587593
exit:
588-
/* Restart the scheduler's work queue for the GFX and page rings
589-
* if they were stopped by this function. This allows new tasks
590-
* to be submitted to the queues after the reset is complete.
591-
*/
592-
if (!ret) {
593-
amdgpu_fence_driver_force_completion(gfx_ring);
594-
drm_sched_wqueue_start(&gfx_ring->sched);
595-
if (adev->sdma.has_page_queue) {
596-
amdgpu_fence_driver_force_completion(page_ring);
597-
drm_sched_wqueue_start(&page_ring->sched);
594+
if (!caller_handles_kernel_queues) {
595+
/* Restart the scheduler's work queue for the GFX and page rings
596+
* if they were stopped by this function. This allows new tasks
597+
* to be submitted to the queues after the reset is complete.
598+
*/
599+
if (!ret) {
600+
amdgpu_fence_driver_force_completion(gfx_ring);
601+
drm_sched_wqueue_start(&gfx_ring->sched);
602+
if (adev->sdma.has_page_queue) {
603+
amdgpu_fence_driver_force_completion(page_ring);
604+
drm_sched_wqueue_start(&page_ring->sched);
605+
}
598606
}
599607
}
600608
mutex_unlock(&sdma_instance->engine_reset_mutex);

drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,8 @@ struct amdgpu_buffer_funcs {
172172
uint32_t byte_count);
173173
};
174174

175-
int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id);
175+
int amdgpu_sdma_reset_engine(struct amdgpu_device *adev, uint32_t instance_id,
176+
bool caller_handles_kernel_queues);
176177

177178
#define amdgpu_emit_copy_buffer(adev, ib, s, d, b, t) (adev)->mman.buffer_funcs->emit_copy_buffer((ib), (s), (d), (b), (t))
178179
#define amdgpu_emit_fill_buffer(adev, ib, s, d, b) (adev)->mman.buffer_funcs->emit_fill_buffer((ib), (s), (d), (b))

drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1668,7 +1668,7 @@ static int sdma_v4_4_2_reset_queue(struct amdgpu_ring *ring,
16681668
return -EOPNOTSUPP;
16691669

16701670
amdgpu_amdkfd_suspend(adev, true);
1671-
r = amdgpu_sdma_reset_engine(adev, id);
1671+
r = amdgpu_sdma_reset_engine(adev, id, false);
16721672
amdgpu_amdkfd_resume(adev, true);
16731673
return r;
16741674
}

drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1548,7 +1548,7 @@ static int sdma_v5_0_reset_queue(struct amdgpu_ring *ring,
15481548
int r;
15491549

15501550
amdgpu_amdkfd_suspend(adev, true);
1551-
r = amdgpu_sdma_reset_engine(adev, inst_id);
1551+
r = amdgpu_sdma_reset_engine(adev, inst_id, false);
15521552
amdgpu_amdkfd_resume(adev, true);
15531553

15541554
return r;

drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1461,7 +1461,7 @@ static int sdma_v5_2_reset_queue(struct amdgpu_ring *ring,
14611461
int r;
14621462

14631463
amdgpu_amdkfd_suspend(adev, true);
1464-
r = amdgpu_sdma_reset_engine(adev, inst_id);
1464+
r = amdgpu_sdma_reset_engine(adev, inst_id, false);
14651465
amdgpu_amdkfd_resume(adev, true);
14661466

14671467
return r;

drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2312,7 +2312,7 @@ static int reset_hung_queues_sdma(struct device_queue_manager *dqm)
23122312
continue;
23132313

23142314
/* Reset engine and check. */
2315-
if (amdgpu_sdma_reset_engine(dqm->dev->adev, i) ||
2315+
if (amdgpu_sdma_reset_engine(dqm->dev->adev, i, false) ||
23162316
dqm->dev->kfd2kgd->hqd_sdma_get_doorbell(dqm->dev->adev, i, j) ||
23172317
!set_sdma_queue_as_reset(dqm, doorbell_off)) {
23182318
r = -ENOTRECOVERABLE;

0 commit comments

Comments
 (0)