Skip to content

Commit 8dba56c

Browse files
committed
Addressed feedback: Change to Enable, Reject scalar and vector clustering
1 parent 608b002 commit 8dba56c

File tree

10 files changed

+483
-465
lines changed

10 files changed

+483
-465
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,10 @@ namespace llvm::AMDGPU {
4747
#include "AMDGPUGenSearchableTables.inc"
4848
} // namespace llvm::AMDGPU
4949

50-
static cl::opt<bool> DisableDiffBasePtrMemClustering(
51-
"amdgpu-disable-diff-baseptr-mem-clustering",
52-
cl::desc("Disable clustering memory ops with different base pointers"),
53-
cl::init(false), cl::Hidden);
50+
static cl::opt<bool> EnableDiffBasePtrMemClustering(
51+
"amdgpu-enable-diff-baseptr-mem-clustering",
52+
cl::desc("Enable clustering memory ops with different base pointers"),
53+
cl::init(true), cl::Hidden);
5454

5555
// Must be at least 4 to be able to branch over minimum unconditional branch
5656
// code. This is only for making it possible to write reasonably small tests for
@@ -585,10 +585,24 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
585585
const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
586586
const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
587587

588-
if (!DisableDiffBasePtrMemClustering) {
588+
if (EnableDiffBasePtrMemClustering) {
589589
// Only consider memory ops from same addrspace for clustering
590590
if (!memOpsHaveSameAddrspace(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
591591
return false;
592+
593+
// Don't cluster scalar and vecter memory ops
594+
const MachineFunction &MF = *FirstLdSt.getParent()->getParent();
595+
const MachineRegisterInfo &MRI = MF.getRegInfo();
596+
if (FirstLdSt.getOperand(0).isReg() &&
597+
SecondLdSt.getOperand(0).isReg()) {
598+
bool isFirstVecReg = RI.isVectorRegister(MRI,
599+
FirstLdSt.getOperand(0).getReg());
600+
bool isSecondVecReg = RI.isVectorRegister(MRI,
601+
SecondLdSt.getOperand(0).getReg());
602+
if ((isFirstVecReg && !isSecondVecReg) ||
603+
(!isFirstVecReg && isSecondVecReg))
604+
return false;
605+
}
592606
} else {
593607
// If the mem ops (to be clustered) do not have the same base ptr, then
594608
// they should not be clustered

llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -288,16 +288,16 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
288288
; GFX8V4-NEXT: v_mov_b32_e32 v0, s0
289289
; GFX8V4-NEXT: v_mov_b32_e32 v1, s1
290290
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
291-
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
292291
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
293292
; GFX8V4-NEXT: v_mov_b32_e32 v0, s4
294293
; GFX8V4-NEXT: v_mov_b32_e32 v1, s5
295294
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
295+
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
296296
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
297297
; GFX8V4-NEXT: v_mov_b32_e32 v0, s10
298+
; GFX8V4-NEXT: v_mov_b32_e32 v1, s11
298299
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
299300
; GFX8V4-NEXT: v_mov_b32_e32 v3, s1
300-
; GFX8V4-NEXT: v_mov_b32_e32 v1, s11
301301
; GFX8V4-NEXT: v_mov_b32_e32 v2, s0
302302
; GFX8V4-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
303303
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
@@ -314,16 +314,16 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
314314
; GFX8V5-NEXT: v_mov_b32_e32 v0, s0
315315
; GFX8V5-NEXT: v_mov_b32_e32 v1, s1
316316
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
317-
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
318317
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
319318
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
320319
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
321320
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
321+
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
322322
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
323323
; GFX8V5-NEXT: v_mov_b32_e32 v0, s10
324+
; GFX8V5-NEXT: v_mov_b32_e32 v1, s11
324325
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
325326
; GFX8V5-NEXT: v_mov_b32_e32 v3, s1
326-
; GFX8V5-NEXT: v_mov_b32_e32 v1, s11
327327
; GFX8V5-NEXT: v_mov_b32_e32 v2, s0
328328
; GFX8V5-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
329329
; GFX8V5-NEXT: s_waitcnt vmcnt(0)

llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll

Lines changed: 25 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -513,16 +513,16 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
513513
; GFX908-LABEL: introduced_copy_to_sgpr:
514514
; GFX908: ; %bb.0: ; %bb
515515
; GFX908-NEXT: global_load_ushort v16, v[0:1], off glc
516-
; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18
517516
; GFX908-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
518517
; GFX908-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10
519-
; GFX908-NEXT: s_mov_b32 s8, 0
520-
; GFX908-NEXT: s_mov_b32 s13, s8
521-
; GFX908-NEXT: v_mov_b32_e32 v19, 0
518+
; GFX908-NEXT: s_load_dword s0, s[8:9], 0x18
519+
; GFX908-NEXT: s_mov_b32 s12, 0
520+
; GFX908-NEXT: s_mov_b32 s9, s12
522521
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
523522
; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s7
524523
; GFX908-NEXT: s_sub_i32 s1, 0, s7
525524
; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s0
525+
; GFX908-NEXT: v_mov_b32_e32 v19, 0
526526
; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0
527527
; GFX908-NEXT: v_mov_b32_e32 v0, 0
528528
; GFX908-NEXT: v_mov_b32_e32 v1, 0
@@ -542,14 +542,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
542542
; GFX908-NEXT: s_cselect_b32 s2, s6, s2
543543
; GFX908-NEXT: s_add_i32 s3, s1, 1
544544
; GFX908-NEXT: s_cmp_ge_u32 s2, s7
545-
; GFX908-NEXT: s_cselect_b32 s12, s3, s1
545+
; GFX908-NEXT: s_cselect_b32 s8, s3, s1
546546
; GFX908-NEXT: s_lshr_b32 s2, s0, 16
547547
; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s2
548548
; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5
549549
; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5
550550
; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1]
551551
; GFX908-NEXT: s_or_b32 s14, s14, 28
552-
; GFX908-NEXT: s_lshl_b64 s[16:17], s[12:13], 5
552+
; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5
553553
; GFX908-NEXT: s_waitcnt vmcnt(0)
554554
; GFX908-NEXT: v_readfirstlane_b32 s2, v16
555555
; GFX908-NEXT: s_and_b32 s2, 0xffff, s2
@@ -573,15 +573,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
573573
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
574574
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
575575
; GFX908-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
576-
; GFX908-NEXT: s_mov_b32 s9, s8
576+
; GFX908-NEXT: s_mov_b32 s13, s12
577577
; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[2:3]
578-
; GFX908-NEXT: v_mov_b32_e32 v4, s8
578+
; GFX908-NEXT: v_mov_b32_e32 v4, s12
579579
; GFX908-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v6
580-
; GFX908-NEXT: v_mov_b32_e32 v6, s8
581-
; GFX908-NEXT: v_mov_b32_e32 v8, s8
582-
; GFX908-NEXT: v_mov_b32_e32 v5, s9
583-
; GFX908-NEXT: v_mov_b32_e32 v7, s9
584-
; GFX908-NEXT: v_mov_b32_e32 v9, s9
580+
; GFX908-NEXT: v_mov_b32_e32 v6, s12
581+
; GFX908-NEXT: v_mov_b32_e32 v8, s12
582+
; GFX908-NEXT: v_mov_b32_e32 v5, s13
583+
; GFX908-NEXT: v_mov_b32_e32 v7, s13
584+
; GFX908-NEXT: v_mov_b32_e32 v9, s13
585585
; GFX908-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
586586
; GFX908-NEXT: v_mov_b32_e32 v11, v5
587587
; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15]
@@ -667,7 +667,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
667667
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
668668
; GFX908-NEXT: ; %bb.11: ; %bb12
669669
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
670-
; GFX908-NEXT: s_add_u32 s10, s10, s12
670+
; GFX908-NEXT: s_add_u32 s10, s10, s8
671671
; GFX908-NEXT: s_addc_u32 s11, s11, 0
672672
; GFX908-NEXT: s_add_u32 s14, s14, s16
673673
; GFX908-NEXT: s_addc_u32 s15, s15, s17
@@ -679,15 +679,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
679679
; GFX90A-LABEL: introduced_copy_to_sgpr:
680680
; GFX90A: ; %bb.0: ; %bb
681681
; GFX90A-NEXT: global_load_ushort v18, v[0:1], off glc
682-
; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18
683682
; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
684683
; GFX90A-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x10
685-
; GFX90A-NEXT: s_mov_b32 s8, 0
686-
; GFX90A-NEXT: s_mov_b32 s13, s8
687-
; GFX90A-NEXT: v_mov_b32_e32 v19, 0
684+
; GFX90A-NEXT: s_load_dword s0, s[8:9], 0x18
685+
; GFX90A-NEXT: s_mov_b32 s12, 0
686+
; GFX90A-NEXT: s_mov_b32 s9, s12
688687
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
689688
; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s7
690689
; GFX90A-NEXT: s_sub_i32 s1, 0, s7
690+
; GFX90A-NEXT: v_mov_b32_e32 v19, 0
691691
; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0
692692
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0
693693
; GFX90A-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
@@ -707,14 +707,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
707707
; GFX90A-NEXT: s_cselect_b32 s2, s6, s2
708708
; GFX90A-NEXT: s_add_i32 s3, s1, 1
709709
; GFX90A-NEXT: s_cmp_ge_u32 s2, s7
710-
; GFX90A-NEXT: s_cselect_b32 s12, s3, s1
710+
; GFX90A-NEXT: s_cselect_b32 s8, s3, s1
711711
; GFX90A-NEXT: s_lshr_b32 s2, s0, 16
712712
; GFX90A-NEXT: v_cvt_f32_f16_e32 v3, s2
713713
; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5
714714
; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5
715715
; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1]
716716
; GFX90A-NEXT: s_or_b32 s14, s14, 28
717-
; GFX90A-NEXT: s_lshl_b64 s[16:17], s[12:13], 5
717+
; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5
718718
; GFX90A-NEXT: s_waitcnt vmcnt(0)
719719
; GFX90A-NEXT: v_readfirstlane_b32 s2, v18
720720
; GFX90A-NEXT: s_and_b32 s2, 0xffff, s2
@@ -738,12 +738,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
738738
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
739739
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
740740
; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
741-
; GFX90A-NEXT: s_mov_b32 s9, s8
741+
; GFX90A-NEXT: s_mov_b32 s13, s12
742742
; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3]
743-
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
743+
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1]
744744
; GFX90A-NEXT: v_cmp_ne_u32_e64 s[2:3], 1, v8
745-
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
746-
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
745+
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
746+
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
747747
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
748748
; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15]
749749
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
@@ -821,7 +821,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
821821
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
822822
; GFX90A-NEXT: ; %bb.11: ; %bb12
823823
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
824-
; GFX90A-NEXT: s_add_u32 s10, s10, s12
824+
; GFX90A-NEXT: s_add_u32 s10, s10, s8
825825
; GFX90A-NEXT: s_addc_u32 s11, s11, 0
826826
; GFX90A-NEXT: s_add_u32 s14, s14, s16
827827
; GFX90A-NEXT: s_addc_u32 s15, s15, s17

0 commit comments

Comments
 (0)