@@ -382,6 +382,18 @@ static void sdma_v5_0_ring_emit_ib(struct amdgpu_ring *ring,
382
382
unsigned vmid = AMDGPU_JOB_GET_VMID (job );
383
383
uint64_t csa_mc_addr = amdgpu_sdma_get_csa_mc_addr (ring , vmid );
384
384
385
+ /* Invalidate L2, because if we don't do it, we might get stale cache
386
+ * lines from previous IBs.
387
+ */
388
+ amdgpu_ring_write (ring , SDMA_PKT_HEADER_OP (SDMA_OP_GCR_REQ ));
389
+ amdgpu_ring_write (ring , 0 );
390
+ amdgpu_ring_write (ring , (SDMA_GCR_GL2_INV |
391
+ SDMA_GCR_GL2_WB |
392
+ SDMA_GCR_GLM_INV |
393
+ SDMA_GCR_GLM_WB ) << 16 );
394
+ amdgpu_ring_write (ring , 0xffffff80 );
395
+ amdgpu_ring_write (ring , 0xffff );
396
+
385
397
/* An IB packet must end on an 8 DW boundary--the next dword
386
398
* must be on an 8-dword boundary. Our IB packet below is 6
387
399
* dwords long, thus add x number of NOPs, such that, in
@@ -1595,7 +1607,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
1595
1607
SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
1596
1608
SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
1597
1609
10 + 10 + 10 , /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
1598
- .emit_ib_size = 7 + 6 , /* sdma_v5_0_ring_emit_ib */
1610
+ .emit_ib_size = 5 + 7 + 6 , /* sdma_v5_0_ring_emit_ib */
1599
1611
.emit_ib = sdma_v5_0_ring_emit_ib ,
1600
1612
.emit_fence = sdma_v5_0_ring_emit_fence ,
1601
1613
.emit_pipeline_sync = sdma_v5_0_ring_emit_pipeline_sync ,
0 commit comments