Skip to content

Commit e5f499f

Browse files
authored
DAG: Allow select ptr combine for non-0 address spaces (#167909)
1 parent c7019c7 commit e5f499f

File tree

12 files changed

+238
-221
lines changed

12 files changed

+238
-221
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 10 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -29031,9 +29031,9 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
2903129031
// over-conservative. It would be beneficial to be able to remember
2903229032
// both potential memory locations. Since we are discarding
2903329033
// src value info, don't do the transformation if the memory
29034-
// locations are not in the default address space.
29035-
LLD->getPointerInfo().getAddrSpace() != 0 ||
29036-
RLD->getPointerInfo().getAddrSpace() != 0 ||
29034+
// locations are not in the same address space.
29035+
LLD->getPointerInfo().getAddrSpace() !=
29036+
RLD->getPointerInfo().getAddrSpace() ||
2903729037
// We can't produce a CMOV of a TargetFrameIndex since we won't
2903829038
// generate the address generation required.
2903929039
LLD->getBasePtr().getOpcode() == ISD::TargetFrameIndex ||
@@ -29115,6 +29115,9 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
2911529115
// but the new load must be the minimum (most restrictive) alignment of the
2911629116
// inputs.
2911729117
Align Alignment = std::min(LLD->getAlign(), RLD->getAlign());
29118+
unsigned AddrSpace = LLD->getAddressSpace();
29119+
assert(AddrSpace == RLD->getAddressSpace());
29120+
2911829121
MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
2911929122
if (!RLD->isInvariant())
2912029123
MMOFlags &= ~MachineMemOperand::MOInvariant;
@@ -29123,15 +29126,16 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
2912329126
if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
2912429127
// FIXME: Discards pointer and AA info.
2912529128
Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
29126-
LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
29127-
MMOFlags);
29129+
LLD->getChain(), Addr, MachinePointerInfo(AddrSpace),
29130+
Alignment, MMOFlags);
2912829131
} else {
2912929132
// FIXME: Discards pointer and AA info.
2913029133
Load = DAG.getExtLoad(
2913129134
LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
2913229135
: LLD->getExtensionType(),
2913329136
SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
29134-
MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
29137+
MachinePointerInfo(AddrSpace), LLD->getMemoryVT(), Alignment,
29138+
MMOFlags);
2913529139
}
2913629140

2913729141
// Users of the select now use the result of the load.

llvm/test/CodeGen/AMDGPU/load-select-ptr.ll

Lines changed: 49 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -7,27 +7,31 @@
77
define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %ptr0, [8 x i32], ptr %ptr1, [8 x i32], ptr addrspace(1) %ptr2) {
88
; GCN-LABEL: select_ptr_crash_i64_flat:
99
; GCN: ; %bb.0:
10-
; GCN-NEXT: s_load_dword s6, s[8:9], 0x0
11-
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x28
12-
; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x50
13-
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x78
1410
; GCN-NEXT: s_add_i32 s12, s12, s17
1511
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
12+
; GCN-NEXT: s_load_dword s2, s[8:9], 0x0
13+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x78
14+
; GCN-NEXT: s_add_u32 s4, s8, 40
15+
; GCN-NEXT: s_addc_u32 s3, s9, 0
16+
; GCN-NEXT: s_add_u32 s5, s8, 0x50
17+
; GCN-NEXT: s_addc_u32 s6, s9, 0
1618
; GCN-NEXT: s_waitcnt lgkmcnt(0)
17-
; GCN-NEXT: s_cmp_eq_u32 s6, 0
18-
; GCN-NEXT: s_cselect_b32 s0, s0, s2
19-
; GCN-NEXT: s_cselect_b32 s1, s1, s3
20-
; GCN-NEXT: v_mov_b32_e32 v0, s0
21-
; GCN-NEXT: v_mov_b32_e32 v1, s1
22-
; GCN-NEXT: s_add_u32 s0, s0, 4
19+
; GCN-NEXT: s_cmp_eq_u32 s2, 0
20+
; GCN-NEXT: s_cselect_b32 s3, s3, s6
21+
; GCN-NEXT: s_cselect_b32 s2, s4, s5
22+
; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
2323
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
24-
; GCN-NEXT: s_addc_u32 s1, s1, 0
24+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
25+
; GCN-NEXT: v_mov_b32_e32 v0, s2
26+
; GCN-NEXT: v_mov_b32_e32 v1, s3
27+
; GCN-NEXT: s_add_u32 s2, s2, 4
2528
; GCN-NEXT: flat_load_dword v0, v[0:1]
26-
; GCN-NEXT: v_mov_b32_e32 v2, s1
27-
; GCN-NEXT: v_mov_b32_e32 v1, s0
29+
; GCN-NEXT: s_addc_u32 s3, s3, 0
30+
; GCN-NEXT: v_mov_b32_e32 v1, s2
31+
; GCN-NEXT: v_mov_b32_e32 v2, s3
2832
; GCN-NEXT: flat_load_dword v1, v[1:2]
29-
; GCN-NEXT: v_mov_b32_e32 v2, s4
30-
; GCN-NEXT: v_mov_b32_e32 v3, s5
33+
; GCN-NEXT: v_mov_b32_e32 v3, s1
34+
; GCN-NEXT: v_mov_b32_e32 v2, s0
3135
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3236
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
3337
; GCN-NEXT: s_endpgm
@@ -45,25 +49,28 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], ptr %p
4549
define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], ptr addrspace(1) %ptr0, [8 x i32], ptr addrspace(1) %ptr1, [8 x i32], ptr addrspace(1) %ptr2) {
4650
; GCN-LABEL: select_ptr_crash_i64_global:
4751
; GCN: ; %bb.0:
48-
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x28
49-
; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x50
50-
; GCN-NEXT: s_load_dword s6, s[8:9], 0x0
51-
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x78
5252
; GCN-NEXT: s_add_i32 s12, s12, s17
5353
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
54+
; GCN-NEXT: s_load_dword s2, s[8:9], 0x0
55+
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x78
56+
; GCN-NEXT: s_add_u32 s4, s8, 40
57+
; GCN-NEXT: s_addc_u32 s3, s9, 0
58+
; GCN-NEXT: s_add_u32 s5, s8, 0x50
59+
; GCN-NEXT: s_addc_u32 s6, s9, 0
5460
; GCN-NEXT: s_waitcnt lgkmcnt(0)
55-
; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
61+
; GCN-NEXT: s_cmp_eq_u32 s2, 0
62+
; GCN-NEXT: s_cselect_b32 s3, s3, s6
63+
; GCN-NEXT: s_cselect_b32 s2, s4, s5
5664
; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
57-
; GCN-NEXT: s_cmp_eq_u32 s6, 0
58-
; GCN-NEXT: v_mov_b32_e32 v2, s4
59-
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
60-
; GCN-NEXT: v_mov_b32_e32 v3, s5
61-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
62-
; GCN-NEXT: s_cselect_b32 s1, s1, s3
63-
; GCN-NEXT: s_cselect_b32 s0, s0, s2
6465
; GCN-NEXT: v_mov_b32_e32 v0, s0
66+
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
6567
; GCN-NEXT: v_mov_b32_e32 v1, s1
66-
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
68+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
69+
; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
70+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
71+
; GCN-NEXT: v_mov_b32_e32 v2, s2
72+
; GCN-NEXT: v_mov_b32_e32 v3, s3
73+
; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
6774
; GCN-NEXT: s_endpgm
6875
%tmp2 = icmp eq i32 %tmp, 0
6976
%tmp3 = load i64, ptr addrspace(1) %ptr0, align 8
@@ -78,22 +85,18 @@ define amdgpu_kernel void @select_ptr_crash_i64_local(i32 %tmp, ptr addrspace(3)
7885
; GCN: ; %bb.0:
7986
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
8087
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
81-
; GCN-NEXT: s_mov_b32 m0, -1
8288
; GCN-NEXT: s_add_i32 s12, s12, s17
8389
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
90+
; GCN-NEXT: s_mov_b32 m0, -1
8491
; GCN-NEXT: s_waitcnt lgkmcnt(0)
85-
; GCN-NEXT: v_mov_b32_e32 v0, s1
86-
; GCN-NEXT: v_mov_b32_e32 v2, s2
87-
; GCN-NEXT: ds_read_b64 v[0:1], v0
88-
; GCN-NEXT: ds_read_b64 v[2:3], v2
8992
; GCN-NEXT: s_cmp_eq_u32 s0, 0
90-
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
91-
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
92-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
93-
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
94-
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
93+
; GCN-NEXT: s_cselect_b32 s0, s1, s2
94+
; GCN-NEXT: v_mov_b32_e32 v0, s0
95+
; GCN-NEXT: ds_read_b64 v[0:1], v0
9596
; GCN-NEXT: v_mov_b32_e32 v2, s4
97+
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
9698
; GCN-NEXT: v_mov_b32_e32 v3, s5
99+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
97100
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
98101
; GCN-NEXT: s_endpgm
99102
%tmp2 = icmp eq i32 %tmp, 0
@@ -112,22 +115,20 @@ define amdgpu_kernel void @select_ptr_crash_i64_local_offsets(i32 %tmp, ptr addr
112115
; GCN: ; %bb.0:
113116
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
114117
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
115-
; GCN-NEXT: s_mov_b32 m0, -1
116118
; GCN-NEXT: s_add_i32 s12, s12, s17
117119
; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
120+
; GCN-NEXT: s_mov_b32 m0, -1
118121
; GCN-NEXT: s_waitcnt lgkmcnt(0)
119-
; GCN-NEXT: v_mov_b32_e32 v0, s1
120-
; GCN-NEXT: v_mov_b32_e32 v2, s2
121-
; GCN-NEXT: ds_read_b64 v[0:1], v0 offset:128
122-
; GCN-NEXT: ds_read_b64 v[2:3], v2 offset:512
122+
; GCN-NEXT: s_addk_i32 s1, 0x80
123+
; GCN-NEXT: s_addk_i32 s2, 0x200
123124
; GCN-NEXT: s_cmp_eq_u32 s0, 0
124-
; GCN-NEXT: s_cselect_b64 vcc, -1, 0
125-
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
126-
; GCN-NEXT: s_waitcnt lgkmcnt(0)
127-
; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
128-
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
125+
; GCN-NEXT: s_cselect_b32 s0, s1, s2
126+
; GCN-NEXT: v_mov_b32_e32 v0, s0
127+
; GCN-NEXT: ds_read_b64 v[0:1], v0
129128
; GCN-NEXT: v_mov_b32_e32 v2, s4
129+
; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
130130
; GCN-NEXT: v_mov_b32_e32 v3, s5
131+
; GCN-NEXT: s_waitcnt lgkmcnt(0)
131132
; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
132133
; GCN-NEXT: s_endpgm
133134
%tmp2 = icmp eq i32 %tmp, 0

llvm/test/CodeGen/AMDGPU/select-load-to-load-select-ptr-combine.ll

Lines changed: 15 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -22,12 +22,12 @@ define i32 @select_load_i32_p1(i1 %cond, ptr addrspace(1) %a, ptr addrspace(1) %
2222
; CHECK-LABEL: select_load_i32_p1:
2323
; CHECK: ; %bb.0:
2424
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
25-
; CHECK-NEXT: global_load_dword v5, v[1:2], off
26-
; CHECK-NEXT: global_load_dword v6, v[3:4], off
2725
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
2826
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
27+
; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
28+
; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
29+
; CHECK-NEXT: global_load_dword v0, v[1:2], off
2930
; CHECK-NEXT: s_waitcnt vmcnt(0)
30-
; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
3131
; CHECK-NEXT: s_setpc_b64 s[30:31]
3232
%ld0 = load i32, ptr addrspace(1) %a
3333
%ld1 = load i32, ptr addrspace(1) %b
@@ -39,12 +39,11 @@ define i32 @select_load_i32_p3(i1 %cond, ptr addrspace(3) %a, ptr addrspace(3) %
3939
; CHECK-LABEL: select_load_i32_p3:
4040
; CHECK: ; %bb.0:
4141
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
42-
; CHECK-NEXT: ds_read_b32 v1, v1
43-
; CHECK-NEXT: ds_read_b32 v2, v2
4442
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
4543
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
46-
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
4744
; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
45+
; CHECK-NEXT: ds_read_b32 v0, v0
46+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
4847
; CHECK-NEXT: s_setpc_b64 s[30:31]
4948
%ld0 = load i32, ptr addrspace(3) %a
5049
%ld1 = load i32, ptr addrspace(3) %b
@@ -90,12 +89,12 @@ define i8 @select_load_i8_p1(i1 %cond, ptr addrspace(1) %a, ptr addrspace(1) %b)
9089
; CHECK-LABEL: select_load_i8_p1:
9190
; CHECK: ; %bb.0:
9291
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
93-
; CHECK-NEXT: global_load_ubyte v5, v[1:2], off
94-
; CHECK-NEXT: global_load_ubyte v6, v[3:4], off
9592
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
9693
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
94+
; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
95+
; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
96+
; CHECK-NEXT: global_load_ubyte v0, v[1:2], off
9797
; CHECK-NEXT: s_waitcnt vmcnt(0)
98-
; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc
9998
; CHECK-NEXT: s_setpc_b64 s[30:31]
10099
%ld0 = load i8, ptr addrspace(1) %a
101100
%ld1 = load i8, ptr addrspace(1) %b
@@ -107,12 +106,16 @@ define i32 @select_load_i32_p1_offset(i1 %cond, ptr addrspace(1) %a, ptr addrspa
107106
; CHECK-LABEL: select_load_i32_p1_offset:
108107
; CHECK: ; %bb.0:
109108
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
110-
; CHECK-NEXT: global_load_dword v3, v[1:2], off offset:256
111-
; CHECK-NEXT: global_load_dword v4, v[1:2], off offset:512
109+
; CHECK-NEXT: v_add_co_u32_e32 v3, vcc, 0x100, v1
110+
; CHECK-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v2, vcc
111+
; CHECK-NEXT: v_add_co_u32_e32 v5, vcc, 0x200, v1
112112
; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
113+
; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
113114
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
115+
; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
116+
; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
117+
; CHECK-NEXT: global_load_dword v0, v[0:1], off
114118
; CHECK-NEXT: s_waitcnt vmcnt(0)
115-
; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
116119
; CHECK-NEXT: s_setpc_b64 s[30:31]
117120
%gep.a = getelementptr i8, ptr addrspace(1) %a, i64 256
118121
%gep.b = getelementptr i8, ptr addrspace(1) %a, i64 512

0 commit comments

Comments
 (0)