Skip to content

Commit bcbede6

Browse files
committed
Merge tag 'amd-drm-next-6.5-2023-06-16' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-6.5-2023-06-16:

amdgpu:
- Misc display fixes
- W=1 fixes
- Improve scheduler naming
- DCN 3.1.4 fixes
- kdoc fixes
- Enable W=1
- VCN 4.0 fix
- xgmi fixes
- TOPDOWN fix for large BAR systems
- eDP fix
- PSR fixes
- SubVP fixes
- Freesync fix
- DPIA fix
- SMU 13.0.5 fixes
- vblflash fix
- RAS fixes
- SDMA 4 fix
- BO locking fix
- BO backing store fix
- NBIO 7.9 fixes
- GC 9.4.3 fixes
- GPU reset recovery fixes
- HMM fix

amdkfd:
- Fix NULL check
- Trap fixes
- Queue count fix
- Add event age tracking

radeon:
- fbdev client fix

scheduler:
- Avoid an infinite loop

UAPI:
- Add KFD event age tracking:
  Proposed ROCT-Thunk-Interface:
  ROCm/ROCT-Thunk-Interface@efdbf6c
  ROCm/ROCT-Thunk-Interface@1820ae0
  Proposed ROCR-Runtime:
  ROCm/ROCR-Runtime@master...zhums:ROCR-Runtime:new_event_wait_review
  ROCm/ROCR-Runtime@e1f5bdb
  ROCm/ROCR-Runtime@7d26afd

drm:
- DP MST fix

Signed-off-by: Dave Airlie <[email protected]>
From: Alex Deucher <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
2 parents 4e237d8 + 72f1de4 commit bcbede6

File tree

93 files changed

+1222
-431
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

93 files changed

+1222
-431
lines changed

drivers/gpu/drm/amd/amdgpu/Makefile

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,18 @@ ccflags-y := -I$(FULL_AMD_PATH)/include/asic_reg \
4040
-I$(FULL_AMD_PATH)/amdkfd
4141

4242
subdir-ccflags-y := -Wextra
43-
subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable)
43+
subdir-ccflags-y += -Wunused
44+
subdir-ccflags-y += -Wmissing-prototypes
45+
subdir-ccflags-y += -Wmissing-declarations
46+
subdir-ccflags-y += -Wmissing-include-dirs
47+
subdir-ccflags-y += -Wold-style-definition
48+
subdir-ccflags-y += -Wmissing-format-attribute
49+
# Need this to avoid recursive variable evaluation issues
50+
cond-flags := $(call cc-option, -Wunused-but-set-variable) \
51+
$(call cc-option, -Wunused-const-variable) \
52+
$(call cc-option, -Wstringop-truncation) \
53+
$(call cc-option, -Wpacked-not-aligned)
54+
subdir-ccflags-y += $(cond-flags)
4455
subdir-ccflags-y += -Wno-unused-parameter
4556
subdir-ccflags-y += -Wno-type-limits
4657
subdir-ccflags-y += -Wno-sign-compare

drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ static int amdgpu_cs_pass1(struct amdgpu_cs_parser *p,
309309
}
310310
p->gang_leader = p->jobs[p->gang_leader_idx];
311311

312-
if (p->ctx->vram_lost_counter != p->gang_leader->vram_lost_counter) {
312+
if (p->ctx->generation != p->gang_leader->generation) {
313313
ret = -ECANCELED;
314314
goto free_all_kdata;
315315
}

drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
333333

334334
ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
335335
ctx->reset_counter_query = ctx->reset_counter;
336-
ctx->vram_lost_counter = atomic_read(&mgr->adev->vram_lost_counter);
336+
ctx->generation = amdgpu_vm_generation(mgr->adev, &fpriv->vm);
337337
ctx->init_priority = priority;
338338
ctx->override_priority = AMDGPU_CTX_PRIORITY_UNSET;
339339

@@ -432,6 +432,7 @@ int amdgpu_ctx_get_entity(struct amdgpu_ctx *ctx, u32 hw_ip, u32 instance,
432432
u32 ring, struct drm_sched_entity **entity)
433433
{
434434
int r;
435+
struct drm_sched_entity *ctx_entity;
435436

436437
if (hw_ip >= AMDGPU_HW_IP_NUM) {
437438
DRM_ERROR("unknown HW IP type: %d\n", hw_ip);
@@ -455,7 +456,14 @@ int amdgpu_ctx_get_entity(struct amdgpu_ctx *ctx, u32 hw_ip, u32 instance,
455456
return r;
456457
}
457458

458-
*entity = &ctx->entities[hw_ip][ring]->entity;
459+
ctx_entity = &ctx->entities[hw_ip][ring]->entity;
460+
r = drm_sched_entity_error(ctx_entity);
461+
if (r) {
462+
DRM_DEBUG("error entity %p\n", ctx_entity);
463+
return r;
464+
}
465+
466+
*entity = ctx_entity;
459467
return 0;
460468
}
461469

@@ -586,7 +594,7 @@ static int amdgpu_ctx_query2(struct amdgpu_device *adev,
586594
if (ctx->reset_counter != atomic_read(&adev->gpu_reset_counter))
587595
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_RESET;
588596

589-
if (ctx->vram_lost_counter != atomic_read(&adev->vram_lost_counter))
597+
if (ctx->generation != amdgpu_vm_generation(adev, &fpriv->vm))
590598
out->state.flags |= AMDGPU_CTX_QUERY2_FLAGS_VRAMLOST;
591599

592600
if (atomic_read(&ctx->guilty))

drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ struct amdgpu_ctx {
4747
struct amdgpu_ctx_mgr *mgr;
4848
unsigned reset_counter;
4949
unsigned reset_counter_query;
50-
uint32_t vram_lost_counter;
50+
uint64_t generation;
5151
spinlock_t ring_lock;
5252
struct amdgpu_ctx_entity *entities[AMDGPU_HW_IP_NUM][AMDGPU_MAX_ENTITY_NUM];
5353
bool preamble_presented;

drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1515,6 +1515,7 @@ static int amdgpu_discovery_get_mall_info(struct amdgpu_device *adev)
15151515
mall_size += mall_size_per_umc;
15161516
}
15171517
adev->gmc.mall_size = mall_size;
1518+
adev->gmc.m_half_use = half_use;
15181519
break;
15191520
default:
15201521
dev_err(adev->dev,
@@ -1896,6 +1897,8 @@ static int amdgpu_discovery_set_gc_ip_blocks(struct amdgpu_device *adev)
18961897
amdgpu_device_ip_block_add(adev, &gfx_v9_0_ip_block);
18971898
break;
18981899
case IP_VERSION(9, 4, 3):
1900+
if (!amdgpu_exp_hw_support)
1901+
return -EINVAL;
18991902
amdgpu_device_ip_block_add(adev, &gfx_v9_4_3_ip_block);
19001903
break;
19011904
case IP_VERSION(10, 1, 10):

drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,6 +691,30 @@ void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring)
691691
}
692692
}
693693

694+
/**
695+
* amdgpu_fence_driver_set_error - set error code on fences
696+
* @ring: the ring which contains the fences
697+
* @error: the error code to set
698+
*
699+
* Set an error code to all the fences pending on the ring.
700+
*/
701+
void amdgpu_fence_driver_set_error(struct amdgpu_ring *ring, int error)
702+
{
703+
struct amdgpu_fence_driver *drv = &ring->fence_drv;
704+
unsigned long flags;
705+
706+
spin_lock_irqsave(&drv->lock, flags);
707+
for (unsigned int i = 0; i <= drv->num_fences_mask; ++i) {
708+
struct dma_fence *fence;
709+
710+
fence = rcu_dereference_protected(drv->fences[i],
711+
lockdep_is_held(&drv->lock));
712+
if (fence && !dma_fence_is_signaled_locked(fence))
713+
dma_fence_set_error(fence, error);
714+
}
715+
spin_unlock_irqrestore(&drv->lock, flags);
716+
}
717+
694718
/**
695719
* amdgpu_fence_driver_force_completion - force signal latest fence of ring
696720
*
@@ -699,6 +723,7 @@ void amdgpu_fence_driver_clear_job_fences(struct amdgpu_ring *ring)
699723
*/
700724
void amdgpu_fence_driver_force_completion(struct amdgpu_ring *ring)
701725
{
726+
amdgpu_fence_driver_set_error(ring, -ECANCELED);
702727
amdgpu_fence_write(ring, ring->fence_drv.sync_seq);
703728
amdgpu_fence_process(ring);
704729
}

drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,8 @@ struct amdgpu_gmc {
301301

302302
/* MALL size */
303303
u64 mall_size;
304+
uint32_t m_half_use;
305+
304306
/* number of UMC instances */
305307
int num_umc;
306308
/* mode2 save restore */

drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,8 +190,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier *notifier,
190190
pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
191191
hmm_range->start, hmm_range->end);
192192

193-
/* Assuming 512MB takes maxmium 1 second to fault page address */
194-
timeout = max((hmm_range->end - hmm_range->start) >> 29, 1UL);
193+
/* Assuming 128MB takes maximum 1 second to fault page address */
194+
timeout = max((hmm_range->end - hmm_range->start) >> 27, 1UL);
195195
timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
196196
timeout = jiffies + msecs_to_jiffies(timeout);
197197

drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,8 @@ void amdgpu_irq_dispatch(struct amdgpu_device *adev,
467467
handled = true;
468468

469469
} else {
470-
DRM_DEBUG("Unhandled interrupt src_id: %d\n", src_id);
470+
DRM_DEBUG("Unregistered interrupt src_id: %d of client_id:%d\n",
471+
src_id, client_id);
471472
}
472473

473474
/* Send it to amdkfd as well if it isn't already handled */

drivers/gpu/drm/amd/amdgpu/amdgpu_job.c

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
109109
(*job)->vm = vm;
110110

111111
amdgpu_sync_create(&(*job)->explicit_sync);
112-
(*job)->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
112+
(*job)->generation = amdgpu_vm_generation(adev, vm);
113113
(*job)->vm_pd_addr = AMDGPU_BO_INVALID_OFFSET;
114114

115115
if (!entity)
@@ -258,16 +258,27 @@ amdgpu_job_prepare_job(struct drm_sched_job *sched_job,
258258
struct dma_fence *fence = NULL;
259259
int r;
260260

261+
/* Ignore soft recovered fences here */
262+
r = drm_sched_entity_error(s_entity);
263+
if (r && r != -ENODATA)
264+
goto error;
265+
261266
if (!fence && job->gang_submit)
262267
fence = amdgpu_device_switch_gang(ring->adev, job->gang_submit);
263268

264269
while (!fence && job->vm && !job->vmid) {
265270
r = amdgpu_vmid_grab(job->vm, ring, job, &fence);
266-
if (r)
271+
if (r) {
267272
DRM_ERROR("Error getting VM ID (%d)\n", r);
273+
goto error;
274+
}
268275
}
269276

270277
return fence;
278+
279+
error:
280+
dma_fence_set_error(&job->base.s_fence->finished, r);
281+
return NULL;
271282
}
272283

273284
static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
@@ -284,7 +295,7 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
284295
trace_amdgpu_sched_run_job(job);
285296

286297
/* Skip job if VRAM is lost and never resubmit gangs */
287-
if (job->vram_lost_counter != atomic_read(&adev->vram_lost_counter) ||
298+
if (job->generation != amdgpu_vm_generation(adev, job->vm) ||
288299
(job->job_run_counter && job->gang_submit))
289300
dma_fence_set_error(finished, -ECANCELED);
290301

0 commit comments

Comments
 (0)