Skip to content

Commit 56b0989

Browse files
ChristianKoenigAMDalexdeucher
authored andcommitted
drm/amdgpu: fix GDS/GWS/OA switch handling
Bas pointed out that this isn't working as expected and could cause crashes. Fix the handling by storing the marker that a switch is needed inside the job instead. Reported-by: Bas Nieuwenhuizen <[email protected]> Signed-off-by: Christian König <[email protected]> Reviewed-by: Alex Deucher <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent e0607c1 commit 56b0989

File tree

3 files changed

+54
-43
lines changed

3 files changed

+54
-43
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,26 @@ bool amdgpu_vmid_had_gpu_reset(struct amdgpu_device *adev,
165165
atomic_read(&adev->gpu_reset_counter);
166166
}
167167

168+
/* Check if we need to switch to another set of resources */
169+
static bool amdgpu_vmid_gds_switch_needed(struct amdgpu_vmid *id,
170+
struct amdgpu_job *job)
171+
{
172+
return id->gds_base != job->gds_base ||
173+
id->gds_size != job->gds_size ||
174+
id->gws_base != job->gws_base ||
175+
id->gws_size != job->gws_size ||
176+
id->oa_base != job->oa_base ||
177+
id->oa_size != job->oa_size;
178+
}
179+
180+
/* Check if the id is compatible with the job */
181+
static bool amdgpu_vmid_compatible(struct amdgpu_vmid *id,
182+
struct amdgpu_job *job)
183+
{
184+
return id->pd_gpu_addr == job->vm_pd_addr &&
185+
!amdgpu_vmid_gds_switch_needed(id, job);
186+
}
187+
168188
/**
169189
* amdgpu_vmid_grab_idle - grab idle VMID
170190
*
@@ -265,7 +285,7 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm,
265285

266286
*id = vm->reserved_vmid[vmhub];
267287
if ((*id)->owner != vm->immediate.fence_context ||
268-
(*id)->pd_gpu_addr != job->vm_pd_addr ||
288+
!amdgpu_vmid_compatible(*id, job) ||
269289
(*id)->flushed_updates < updates ||
270290
!(*id)->last_flush ||
271291
((*id)->last_flush->context != fence_context &&
@@ -294,7 +314,6 @@ static int amdgpu_vmid_grab_reserved(struct amdgpu_vm *vm,
294314
if (r)
295315
return r;
296316

297-
(*id)->flushed_updates = updates;
298317
job->vm_needs_flush = needs_flush;
299318
return 0;
300319
}
@@ -333,7 +352,7 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm,
333352
if ((*id)->owner != vm->immediate.fence_context)
334353
continue;
335354

336-
if ((*id)->pd_gpu_addr != job->vm_pd_addr)
355+
if (!amdgpu_vmid_compatible(*id, job))
337356
continue;
338357

339358
if (!(*id)->last_flush ||
@@ -355,7 +374,6 @@ static int amdgpu_vmid_grab_used(struct amdgpu_vm *vm,
355374
if (r)
356375
return r;
357376

358-
(*id)->flushed_updates = updates;
359377
job->vm_needs_flush |= needs_flush;
360378
return 0;
361379
}
@@ -408,22 +426,30 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct amdgpu_ring *ring,
408426
if (r)
409427
goto error;
410428

411-
id->flushed_updates = amdgpu_vm_tlb_seq(vm);
412429
job->vm_needs_flush = true;
413430
}
414431

415432
list_move_tail(&id->list, &id_mgr->ids_lru);
416433
}
417434

418-
id->pd_gpu_addr = job->vm_pd_addr;
419-
id->owner = vm->immediate.fence_context;
420-
435+
job->gds_switch_needed = amdgpu_vmid_gds_switch_needed(id, job);
421436
if (job->vm_needs_flush) {
437+
id->flushed_updates = amdgpu_vm_tlb_seq(vm);
422438
dma_fence_put(id->last_flush);
423439
id->last_flush = NULL;
424440
}
425441
job->vmid = id - id_mgr->ids;
426442
job->pasid = vm->pasid;
443+
444+
id->gds_base = job->gds_base;
445+
id->gds_size = job->gds_size;
446+
id->gws_base = job->gws_base;
447+
id->gws_size = job->gws_size;
448+
id->oa_base = job->oa_base;
449+
id->oa_size = job->oa_size;
450+
id->pd_gpu_addr = job->vm_pd_addr;
451+
id->owner = vm->immediate.fence_context;
452+
427453
trace_amdgpu_vm_grab_id(vm, ring, job);
428454

429455
error:

drivers/gpu/drm/amd/amdgpu/amdgpu_job.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ struct amdgpu_job {
5353
uint32_t preamble_status;
5454
uint32_t preemption_status;
5555
bool vm_needs_flush;
56+
bool gds_switch_needed;
5657
uint64_t vm_pd_addr;
5758
unsigned vmid;
5859
unsigned pasid;

drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

Lines changed: 19 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -484,25 +484,20 @@ bool amdgpu_vm_need_pipeline_sync(struct amdgpu_ring *ring,
484484
struct amdgpu_device *adev = ring->adev;
485485
unsigned vmhub = ring->funcs->vmhub;
486486
struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
487-
struct amdgpu_vmid *id;
488-
bool gds_switch_needed;
489-
bool vm_flush_needed = job->vm_needs_flush || ring->has_compute_vm_bug;
490487

491488
if (job->vmid == 0)
492489
return false;
493-
id = &id_mgr->ids[job->vmid];
494-
gds_switch_needed = ring->funcs->emit_gds_switch && (
495-
id->gds_base != job->gds_base ||
496-
id->gds_size != job->gds_size ||
497-
id->gws_base != job->gws_base ||
498-
id->gws_size != job->gws_size ||
499-
id->oa_base != job->oa_base ||
500-
id->oa_size != job->oa_size);
501-
502-
if (amdgpu_vmid_had_gpu_reset(adev, id))
490+
491+
if (job->vm_needs_flush || ring->has_compute_vm_bug)
492+
return true;
493+
494+
if (ring->funcs->emit_gds_switch && job->gds_switch_needed)
503495
return true;
504496

505-
return vm_flush_needed || gds_switch_needed;
497+
if (amdgpu_vmid_had_gpu_reset(adev, &id_mgr->ids[job->vmid]))
498+
return true;
499+
500+
return false;
506501
}
507502

508503
/**
@@ -524,13 +519,8 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
524519
unsigned vmhub = ring->funcs->vmhub;
525520
struct amdgpu_vmid_mgr *id_mgr = &adev->vm_manager.id_mgr[vmhub];
526521
struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
527-
bool gds_switch_needed = ring->funcs->emit_gds_switch && (
528-
id->gds_base != job->gds_base ||
529-
id->gds_size != job->gds_size ||
530-
id->gws_base != job->gws_base ||
531-
id->gws_size != job->gws_size ||
532-
id->oa_base != job->oa_base ||
533-
id->oa_size != job->oa_size);
522+
bool gds_switch_needed = ring->funcs->emit_gds_switch &&
523+
job->gds_switch_needed;
534524
bool vm_flush_needed = job->vm_needs_flush;
535525
struct dma_fence *fence = NULL;
536526
bool pasid_mapping_needed = false;
@@ -577,6 +567,14 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
577567
if (pasid_mapping_needed)
578568
amdgpu_gmc_emit_pasid_mapping(ring, job->vmid, job->pasid);
579569

570+
if (!ring->is_mes_queue && ring->funcs->emit_gds_switch &&
571+
gds_switch_needed) {
572+
amdgpu_ring_emit_gds_switch(ring, job->vmid, job->gds_base,
573+
job->gds_size, job->gws_base,
574+
job->gws_size, job->oa_base,
575+
job->oa_size);
576+
}
577+
580578
if (vm_flush_needed || pasid_mapping_needed) {
581579
r = amdgpu_fence_emit(ring, &fence, NULL, 0);
582580
if (r)
@@ -601,20 +599,6 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job,
601599
}
602600
dma_fence_put(fence);
603601

604-
if (!ring->is_mes_queue && ring->funcs->emit_gds_switch &&
605-
gds_switch_needed) {
606-
id->gds_base = job->gds_base;
607-
id->gds_size = job->gds_size;
608-
id->gws_base = job->gws_base;
609-
id->gws_size = job->gws_size;
610-
id->oa_base = job->oa_base;
611-
id->oa_size = job->oa_size;
612-
amdgpu_ring_emit_gds_switch(ring, job->vmid, job->gds_base,
613-
job->gds_size, job->gws_base,
614-
job->gws_size, job->oa_base,
615-
job->oa_size);
616-
}
617-
618602
if (ring->funcs->patch_cond_exec)
619603
amdgpu_ring_patch_cond_exec(ring, patch_offset);
620604

0 commit comments

Comments
 (0)