
Commit be25455

jiadozhu authored and alexdeucher committed
drm/amdgpu: Modify unmap_queue format for gfx9 (v6)
1. Modify the unmap_queue packet on gfx9. Add a trailing fence to track
   when the preemption is done.
2. Modify the emit_ce_meta and emit_de_meta functions for the resumed IBs.

v2: Restyle code not to use the ternary operator.
v3: Modify code format.
v4: Enable Mid-Command Buffer Preemption for gfx9 by default.
v5: Optimize the flag bit set for emit_fence.
v6: Modify the log message for preemption timeout.

Cc: Christian Koenig <[email protected]>
Cc: Michel Dänzer <[email protected]>
Cc: Luben Tuikov <[email protected]>
Signed-off-by: Jiadong.Zhu <[email protected]>
Acked-by: Christian König <[email protected]>
Acked-by: Huang Rui <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent 0c97a19 commit be25455
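
The heart of the change is a trailing-fence handshake: the preempt path bumps ring->trail_seq, emits a fence that the CP writes back to trail_fence_gpu_addr only once preemption has actually happened, and then busy-polls the CPU-visible mirror at microsecond granularity, as gfx_v9_0_ring_preempt_ib() below does. A minimal host-side analogy in plain C — a thread standing in for the CP, names hypothetical, not driver code:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Stand-ins for *ring->trail_fence_cpu_addr and ring->trail_seq. */
    static atomic_uint trail_fence; /* written by the "CP" when preemption is done */
    static unsigned int trail_seq;  /* sequence number the driver expects */

    /* Simulated CP: finishes the preemption after a while, then signals. */
    static void *cp_thread(void *arg)
    {
            (void)arg;
            usleep(50);                            /* pretend preemption takes 50 us */
            atomic_store(&trail_fence, trail_seq); /* trailing fence write-back */
            return NULL;
    }

    int main(void)
    {
            pthread_t cp;
            int i, usec_timeout = 100000;          /* adev->usec_timeout analogue */

            trail_seq += 1;                        /* ring->trail_seq += 1 */
            pthread_create(&cp, NULL, cp_thread, NULL);

            /* Poll the trailing fence, as gfx_v9_0_ring_preempt_ib() does. */
            for (i = 0; i < usec_timeout; i++) {
                    if (atomic_load(&trail_fence) == trail_seq)
                            break;
                    usleep(1);
            }

            if (i >= usec_timeout)
                    printf("timeout to preempt ib\n"); /* DRM_WARN in the driver */
            else
                    printf("preemption done after ~%d us\n", i);

            pthread_join(cp, NULL);
            return 0;
    }

(Build with -pthread.)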

File tree: 3 files changed, +156 -29 lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@ enum amdgpu_ring_priority_level {
 #define AMDGPU_FENCE_FLAG_64BIT (1 << 0)
 #define AMDGPU_FENCE_FLAG_INT (1 << 1)
 #define AMDGPU_FENCE_FLAG_TC_WB_ONLY (1 << 2)
+#define AMDGPU_FENCE_FLAG_EXEC (1 << 3)
 
 #define to_amdgpu_ring(s) container_of((s), struct amdgpu_ring, sched)

drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

Lines changed: 153 additions & 29 deletions
@@ -755,7 +755,7 @@ static void gfx_v9_0_set_rlc_funcs(struct amdgpu_device *adev);
 static int gfx_v9_0_get_cu_info(struct amdgpu_device *adev,
 				struct amdgpu_cu_info *cu_info);
 static uint64_t gfx_v9_0_get_gpu_clock_counter(struct amdgpu_device *adev);
-static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring);
+static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume);
 static u64 gfx_v9_0_ring_get_rptr_compute(struct amdgpu_ring *ring);
 static void gfx_v9_0_query_ras_error_count(struct amdgpu_device *adev,
 					   void *ras_error_status);
@@ -828,9 +828,10 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring *kiq_ring,
 			PACKET3_UNMAP_QUEUES_DOORBELL_OFFSET0(ring->doorbell_index));
 
 	if (action == PREEMPT_QUEUES_NO_UNMAP) {
-		amdgpu_ring_write(kiq_ring, lower_32_bits(gpu_addr));
-		amdgpu_ring_write(kiq_ring, upper_32_bits(gpu_addr));
-		amdgpu_ring_write(kiq_ring, seq);
+		amdgpu_ring_write(kiq_ring, lower_32_bits(ring->wptr & ring->buf_mask));
+		amdgpu_ring_write(kiq_ring, 0);
+		amdgpu_ring_write(kiq_ring, 0);
+
 	} else {
 		amdgpu_ring_write(kiq_ring, 0);
 		amdgpu_ring_write(kiq_ring, 0);
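
With PREEMPT_QUEUES_NO_UNMAP the packet no longer carries the fence address/sequence pair — the trailing fence is now emitted on the gfx ring itself by gfx_v9_0_ring_preempt_ib() further down — and instead passes the write pointer masked into the ring buffer, i.e. the dword offset the CP should resume from. A tiny standalone illustration of that masking (ring size and wptr value made up):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* A 1024-dword ring: buf_mask = ring size - 1, as in struct
             * amdgpu_ring. wptr increases monotonically and wraps via the
             * mask. */
            uint64_t buf_mask = 1024 - 1;
            uint64_t wptr = 5000; /* has wrapped the ring a few times */

            /* 5000 mod 1024 = 904 */
            printf("resume offset = %u\n", (unsigned)(wptr & buf_mask));
            return 0;
    }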
@@ -5204,11 +5205,17 @@ static void gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 
 	control |= ib->length_dw | (vmid << 24);
 
-	if (amdgpu_sriov_vf(ring->adev) && (ib->flags & AMDGPU_IB_FLAG_PREEMPT)) {
+	if (ib->flags & AMDGPU_IB_FLAG_PREEMPT) {
 		control |= INDIRECT_BUFFER_PRE_ENB(1);
 
+		if (flags & AMDGPU_IB_PREEMPTED)
+			control |= INDIRECT_BUFFER_PRE_RESUME(1);
+
 		if (!(ib->flags & AMDGPU_IB_FLAG_CE) && vmid)
-			gfx_v9_0_ring_emit_de_meta(ring);
+			gfx_v9_0_ring_emit_de_meta(ring,
+						   (!amdgpu_sriov_vf(ring->adev) &&
+						    flags & AMDGPU_IB_PREEMPTED) ?
+						   true : false);
 	}
 
 	amdgpu_ring_write(ring, header);
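
The INDIRECT_BUFFER control dword now distinguishes first submission from resubmission: PRE_ENB(1) marks the IB preemptible, and the new PRE_RESUME(1) bit (added to soc15d.h below) tells the CP the IB was already partially executed and must be resumed rather than restarted. A standalone sketch of how the control word is composed (length_dw and vmid are sample values):

    #include <stdint.h>
    #include <stdio.h>

    /* Bit positions as defined in soc15d.h. */
    #define INDIRECT_BUFFER_PRE_ENB(x)      ((x) << 21)
    #define INDIRECT_BUFFER_PRE_RESUME(x)   ((x) << 30)

    int main(void)
    {
            uint32_t length_dw = 16, vmid = 3; /* made-up sample values */
            uint32_t control = length_dw | (vmid << 24);

            control |= INDIRECT_BUFFER_PRE_ENB(1);    /* IB is preemptible */
            control |= INDIRECT_BUFFER_PRE_RESUME(1); /* IB is being resumed */

            printf("INDIRECT_BUFFER control = 0x%08x\n", control);
            return 0;
    }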
@@ -5263,17 +5270,24 @@ static void gfx_v9_0_ring_emit_fence(struct amdgpu_ring *ring, u64 addr,
 	bool write64bit = flags & AMDGPU_FENCE_FLAG_64BIT;
 	bool int_sel = flags & AMDGPU_FENCE_FLAG_INT;
 	bool writeback = flags & AMDGPU_FENCE_FLAG_TC_WB_ONLY;
+	bool exec = flags & AMDGPU_FENCE_FLAG_EXEC;
+	uint32_t dw2 = 0;
 
 	/* RELEASE_MEM - flush caches, send int */
 	amdgpu_ring_write(ring, PACKET3(PACKET3_RELEASE_MEM, 6));
-	amdgpu_ring_write(ring, ((writeback ? (EOP_TC_WB_ACTION_EN |
-					       EOP_TC_NC_ACTION_EN) :
-					      (EOP_TCL1_ACTION_EN |
-					       EOP_TC_ACTION_EN |
-					       EOP_TC_WB_ACTION_EN |
-					       EOP_TC_MD_ACTION_EN)) |
-				 EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
-				 EVENT_INDEX(5)));
+
+	if (writeback) {
+		dw2 = EOP_TC_NC_ACTION_EN;
+	} else {
+		dw2 = EOP_TCL1_ACTION_EN | EOP_TC_ACTION_EN |
+		      EOP_TC_MD_ACTION_EN;
+	}
+	dw2 |= EOP_TC_WB_ACTION_EN | EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) |
+	       EVENT_INDEX(5);
+	if (exec)
+		dw2 |= EOP_EXEC;
+
+	amdgpu_ring_write(ring, dw2);
 	amdgpu_ring_write(ring, DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
 
 	/*
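
The dw2 rework above is behavior-preserving for both cache-action paths; the only functional addition is EOP_EXEC, gated on the new fence flag. A self-contained check of that equivalence (bit values are placeholders, since only the OR-regrouping is being verified):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define EOP_TC_WB_ACTION_EN     (1u << 15)
    #define EOP_TCL1_ACTION_EN      (1u << 16)
    #define EOP_TC_ACTION_EN        (1u << 17)
    #define EOP_TC_NC_ACTION_EN     (1u << 19)
    #define EOP_TC_MD_ACTION_EN     (1u << 21)
    #define EOP_EXEC                (1u << 28)
    #define EVENT_TYPE(x)           ((x) << 0)
    #define EVENT_INDEX(x)          ((x) << 8)
    #define CACHE_FLUSH_AND_INV_TS_EVENT 0x14

    /* dw2 as composed before this patch (the removed ternary). */
    static uint32_t dw2_old(int writeback)
    {
            return (writeback ? (EOP_TC_WB_ACTION_EN | EOP_TC_NC_ACTION_EN) :
                                (EOP_TCL1_ACTION_EN | EOP_TC_ACTION_EN |
                                 EOP_TC_WB_ACTION_EN | EOP_TC_MD_ACTION_EN)) |
                   EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
    }

    /* dw2 as composed after this patch. */
    static uint32_t dw2_new(int writeback, int exec)
    {
            uint32_t dw2;

            if (writeback)
                    dw2 = EOP_TC_NC_ACTION_EN;
            else
                    dw2 = EOP_TCL1_ACTION_EN | EOP_TC_ACTION_EN |
                          EOP_TC_MD_ACTION_EN;
            dw2 |= EOP_TC_WB_ACTION_EN |
                   EVENT_TYPE(CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5);
            if (exec)
                    dw2 |= EOP_EXEC;
            return dw2;
    }

    int main(void)
    {
            for (int wb = 0; wb <= 1; wb++) {
                    assert(dw2_new(wb, 0) == dw2_old(wb));
                    assert(dw2_new(wb, 1) == (dw2_old(wb) | EOP_EXEC));
            }
            printf("dw2 refactor equivalent modulo EOP_EXEC\n");
            return 0;
    }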
@@ -5378,33 +5392,135 @@ static void gfx_v9_ring_emit_sb(struct amdgpu_ring *ring)
 	amdgpu_ring_write(ring, 0);
 }
 
-static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring)
+static void gfx_v9_0_ring_emit_ce_meta(struct amdgpu_ring *ring, bool resume)
 {
+	struct amdgpu_device *adev = ring->adev;
 	struct v9_ce_ib_state ce_payload = {0};
-	uint64_t csa_addr;
+	uint64_t offset, ce_payload_gpu_addr;
+	void *ce_payload_cpu_addr;
 	int cnt;
 
 	cnt = (sizeof(ce_payload) >> 2) + 4 - 2;
-	csa_addr = amdgpu_csa_vaddr(ring->adev);
+
+	if (ring->is_mes_queue) {
+		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
+				  gfx[0].gfx_meta_data) +
+			offsetof(struct v9_gfx_meta_data, ce_payload);
+		ce_payload_gpu_addr =
+			amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
+		ce_payload_cpu_addr =
+			amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
+	} else {
+		offset = offsetof(struct v9_gfx_meta_data, ce_payload);
+		ce_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
+		ce_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
+	}
 
 	amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, cnt));
 	amdgpu_ring_write(ring, (WRITE_DATA_ENGINE_SEL(2) |
 				 WRITE_DATA_DST_SEL(8) |
 				 WR_CONFIRM) |
 				 WRITE_DATA_CACHE_POLICY(0));
-	amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
-	amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, ce_payload)));
-	amdgpu_ring_write_multiple(ring, (void *)&ce_payload, sizeof(ce_payload) >> 2);
+	amdgpu_ring_write(ring, lower_32_bits(ce_payload_gpu_addr));
+	amdgpu_ring_write(ring, upper_32_bits(ce_payload_gpu_addr));
+
+	if (resume)
+		amdgpu_ring_write_multiple(ring, ce_payload_cpu_addr,
+					   sizeof(ce_payload) >> 2);
+	else
+		amdgpu_ring_write_multiple(ring, (void *)&ce_payload,
+					   sizeof(ce_payload) >> 2);
+}
+
+static int gfx_v9_0_ring_preempt_ib(struct amdgpu_ring *ring)
+{
+	int i, r = 0;
+	struct amdgpu_device *adev = ring->adev;
+	struct amdgpu_kiq *kiq = &adev->gfx.kiq;
+	struct amdgpu_ring *kiq_ring = &kiq->ring;
+	unsigned long flags;
+
+	if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
+		return -EINVAL;
+
+	spin_lock_irqsave(&kiq->ring_lock, flags);
+
+	if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
+		spin_unlock_irqrestore(&kiq->ring_lock, flags);
+		return -ENOMEM;
+	}
+
+	/* assert preemption condition */
+	amdgpu_ring_set_preempt_cond_exec(ring, false);
+
+	ring->trail_seq += 1;
+	amdgpu_ring_alloc(ring, 13);
+	gfx_v9_0_ring_emit_fence(ring, ring->trail_fence_gpu_addr,
+				 ring->trail_seq, AMDGPU_FENCE_FLAG_EXEC);
+	/* reset the CP_VMID_PREEMPT after trailing fence */
+	amdgpu_ring_emit_wreg(ring,
+			      SOC15_REG_OFFSET(GC, 0, mmCP_VMID_PREEMPT),
+			      0x0);
+
+	/* assert IB preemption, emit the trailing fence */
+	kiq->pmf->kiq_unmap_queues(kiq_ring, ring, PREEMPT_QUEUES_NO_UNMAP,
+				   ring->trail_fence_gpu_addr,
+				   ring->trail_seq);
+
+	amdgpu_ring_commit(kiq_ring);
+	spin_unlock_irqrestore(&kiq->ring_lock, flags);
+
+	/* poll the trailing fence */
+	for (i = 0; i < adev->usec_timeout; i++) {
+		if (ring->trail_seq ==
+		    le32_to_cpu(*ring->trail_fence_cpu_addr))
+			break;
+		udelay(1);
+	}
+
+	if (i >= adev->usec_timeout) {
+		r = -EINVAL;
+		DRM_WARN("ring %d timeout to preempt ib\n", ring->idx);
+	}
+
+	amdgpu_ring_commit(ring);
+
+	/* deassert preemption condition */
+	amdgpu_ring_set_preempt_cond_exec(ring, true);
+	return r;
 }
 
-static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
+static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume)
 {
+	struct amdgpu_device *adev = ring->adev;
 	struct v9_de_ib_state de_payload = {0};
-	uint64_t csa_addr, gds_addr;
+	uint64_t offset, gds_addr, de_payload_gpu_addr;
+	void *de_payload_cpu_addr;
 	int cnt;
 
-	csa_addr = amdgpu_csa_vaddr(ring->adev);
-	gds_addr = csa_addr + 4096;
+	if (ring->is_mes_queue) {
+		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
+				  gfx[0].gfx_meta_data) +
+			offsetof(struct v9_gfx_meta_data, de_payload);
+		de_payload_gpu_addr =
+			amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
+		de_payload_cpu_addr =
+			amdgpu_mes_ctx_get_offs_cpu_addr(ring, offset);
+
+		offset = offsetof(struct amdgpu_mes_ctx_meta_data,
+				  gfx[0].gds_backup) +
+			offsetof(struct v9_gfx_meta_data, de_payload);
+		gds_addr = amdgpu_mes_ctx_get_offs_gpu_addr(ring, offset);
+	} else {
+		offset = offsetof(struct v9_gfx_meta_data, de_payload);
+		de_payload_gpu_addr = amdgpu_csa_vaddr(ring->adev) + offset;
+		de_payload_cpu_addr = adev->virt.csa_cpu_addr + offset;
+
+		gds_addr = ALIGN(amdgpu_csa_vaddr(ring->adev) +
+				 AMDGPU_CSA_SIZE - adev->gds.gds_size,
+				 PAGE_SIZE);
+	}
+
 	de_payload.gds_backup_addrlo = lower_32_bits(gds_addr);
 	de_payload.gds_backup_addrhi = upper_32_bits(gds_addr);
 
@@ -5414,9 +5530,15 @@ static void gfx_v9_0_ring_emit_de_meta(struct amdgpu_ring *ring)
 				 WRITE_DATA_DST_SEL(8) |
 				 WR_CONFIRM) |
 				 WRITE_DATA_CACHE_POLICY(0));
-	amdgpu_ring_write(ring, lower_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
-	amdgpu_ring_write(ring, upper_32_bits(csa_addr + offsetof(struct v9_gfx_meta_data, de_payload)));
-	amdgpu_ring_write_multiple(ring, (void *)&de_payload, sizeof(de_payload) >> 2);
+	amdgpu_ring_write(ring, lower_32_bits(de_payload_gpu_addr));
+	amdgpu_ring_write(ring, upper_32_bits(de_payload_gpu_addr));
+
+	if (resume)
+		amdgpu_ring_write_multiple(ring, de_payload_cpu_addr,
+					   sizeof(de_payload) >> 2);
+	else
+		amdgpu_ring_write_multiple(ring, (void *)&de_payload,
+					   sizeof(de_payload) >> 2);
 }
 
 static void gfx_v9_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start,
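
Both meta-data emitters now build the payload address from chained offsetof() terms so that one code path serves MES queues and the legacy CSA mapping. A toy standalone demo of that composition (struct layouts hypothetical; the real ones live in the amdgpu headers):

    #include <stddef.h>
    #include <stdio.h>

    /* Hypothetical stand-ins for the MES context and v9_gfx_meta_data. */
    struct gfx_meta {
            char ce_payload[64];
            char de_payload[64];
    };

    struct mes_ctx_meta {
            char other_state[128];
            struct gfx_meta gfx_meta_data;
    };

    int main(void)
    {
            /* Mirrors: offsetof(struct amdgpu_mes_ctx_meta_data,
             * gfx[0].gfx_meta_data) + offsetof(struct v9_gfx_meta_data,
             * de_payload); the result is then fed to the gpu/cpu address
             * helpers. */
            size_t offset = offsetof(struct mes_ctx_meta, gfx_meta_data) +
                            offsetof(struct gfx_meta, de_payload);

            printf("de_payload at byte offset %zu\n", offset);
            return 0;
    }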
@@ -5432,8 +5554,9 @@ static void gfx_v9_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
 {
 	uint32_t dw2 = 0;
 
-	if (amdgpu_sriov_vf(ring->adev))
-		gfx_v9_0_ring_emit_ce_meta(ring);
+	gfx_v9_0_ring_emit_ce_meta(ring,
+				   (!amdgpu_sriov_vf(ring->adev) &&
+				    flags & AMDGPU_IB_PREEMPTED) ? true : false);
 
 	dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
 	if (flags & AMDGPU_HAVE_CTX_SWITCH) {
@@ -6760,6 +6883,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
 	.emit_cntxcntl = gfx_v9_ring_emit_cntxcntl,
 	.init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
 	.patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
+	.preempt_ib = gfx_v9_0_ring_preempt_ib,
 	.emit_frame_cntl = gfx_v9_0_ring_emit_frame_cntl,
 	.emit_wreg = gfx_v9_0_ring_emit_wreg,
 	.emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,

drivers/gpu/drm/amd/amdgpu/soc15d.h

Lines changed: 2 additions & 0 deletions
@@ -162,6 +162,7 @@
  * 2 - Bypass
  */
 #define INDIRECT_BUFFER_PRE_ENB(x) ((x) << 21)
+#define INDIRECT_BUFFER_PRE_RESUME(x) ((x) << 30)
 #define PACKET3_COPY_DATA 0x40
 #define PACKET3_PFP_SYNC_ME 0x42
 #define PACKET3_COND_WRITE 0x45
@@ -184,6 +185,7 @@
 #define EOP_TC_ACTION_EN (1 << 17) /* L2 */
 #define EOP_TC_NC_ACTION_EN (1 << 19)
 #define EOP_TC_MD_ACTION_EN (1 << 21) /* L2 metadata */
+#define EOP_EXEC (1 << 28) /* For Trailing Fence */
 
 #define DATA_SEL(x) ((x) << 29)
 /* 0 - discard
