Skip to content

Commit 7c88d13

Browse files
committed
[AMDGPU] Prefer SplitVectorLoad/Store over expandUnalignedLoad/Store
expandUnalignedLoad/Store can sometimes produce unnecessary copies to a temporary stack slot. We should prefer splitting vectors where possible. Differential Revision: https://reviews.llvm.org/D88882
1 parent 380087e commit 7c88d13

File tree

2 files changed

+54
-106
lines changed

2 files changed

+54
-106
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8014,13 +8014,6 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
80148014
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
80158015
"Custom lowering for non-i32 vectors hasn't been implemented.");
80168016

8017-
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
8018-
MemVT, *Load->getMemOperand())) {
8019-
SDValue Ops[2];
8020-
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
8021-
return DAG.getMergeValues(Ops, DL);
8022-
}
8023-
80248017
unsigned Alignment = Load->getAlignment();
80258018
unsigned AS = Load->getAddressSpace();
80268019
if (Subtarget->hasLDSMisalignedBug() &&
@@ -8132,6 +8125,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
81328125
return SplitVectorLoad(Op, DAG);
81338126
}
81348127
}
8128+
8129+
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
8130+
MemVT, *Load->getMemOperand())) {
8131+
SDValue Ops[2];
8132+
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
8133+
return DAG.getMergeValues(Ops, DL);
8134+
}
8135+
81358136
return SDValue();
81368137
}
81378138

@@ -8537,11 +8538,6 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
85378538
assert(VT.isVector() &&
85388539
Store->getValue().getValueType().getScalarType() == MVT::i32);
85398540

8540-
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
8541-
VT, *Store->getMemOperand())) {
8542-
return expandUnalignedStore(Store, DAG);
8543-
}
8544-
85458541
unsigned AS = Store->getAddressSpace();
85468542
if (Subtarget->hasLDSMisalignedBug() &&
85478543
AS == AMDGPUAS::FLAT_ADDRESS &&
@@ -8566,6 +8562,11 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
85668562
// v3 stores not supported on SI.
85678563
if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
85688564
return SplitVectorStore(Op, DAG);
8565+
8566+
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
8567+
VT, *Store->getMemOperand()))
8568+
return expandUnalignedStore(Store, DAG);
8569+
85698570
return SDValue();
85708571
} else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
85718572
switch (Subtarget->getMaxPrivateElementSize()) {
@@ -8605,6 +8606,13 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
86058606
return SplitVectorStore(Op, DAG);
86068607
}
86078608

8609+
if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
8610+
VT, *Store->getMemOperand())) {
8611+
if (VT.isVector())
8612+
return SplitVectorStore(Op, DAG);
8613+
return expandUnalignedStore(Store, DAG);
8614+
}
8615+
86088616
return SDValue();
86098617
} else {
86108618
llvm_unreachable("unhandled address space");

llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll

Lines changed: 34 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -7,35 +7,15 @@
77
define amdgpu_vs void @test(<4 x i32> inreg %arg1, <6 x float> addrspace(3)* %arg2) {
88
; CHECK-LABEL: test:
99
; CHECK: ; %bb.0:
10-
; CHECK-NEXT: s_mov_b32 s8, s4
11-
; CHECK-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0
12-
; CHECK-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1
13-
; CHECK-NEXT: s_mov_b32 s6, -1
14-
; CHECK-NEXT: s_mov_b32 s7, 0xe8f000
15-
; CHECK-NEXT: s_add_u32 s4, s4, s8
16-
; CHECK-NEXT: s_addc_u32 s5, s5, 0
10+
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 12, v0
1711
; CHECK-NEXT: v_add_i32_e32 v1, vcc, 8, v0
18-
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 12, v0
12+
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v0
1913
; CHECK-NEXT: s_mov_b32 m0, -1
20-
; CHECK-NEXT: ds_read_b32 v1, v1
21-
; CHECK-NEXT: ds_read_b32 v2, v2
22-
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 4, v0
14+
; CHECK-NEXT: ds_read_b32 v2, v1
15+
; CHECK-NEXT: ds_read_b32 v1, v4
2316
; CHECK-NEXT: ds_read_b32 v3, v3
2417
; CHECK-NEXT: ds_read_b32 v0, v0
25-
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
26-
; CHECK-NEXT: buffer_store_dword v2, off, s[4:7], 0 offset:28
27-
; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0 offset:24
28-
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
29-
; CHECK-NEXT: buffer_store_dword v3, off, s[4:7], 0 offset:20
3018
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
31-
; CHECK-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:16
32-
; CHECK-NEXT: s_waitcnt expcnt(1)
33-
; CHECK-NEXT: buffer_load_dword v3, off, s[4:7], 0 offset:28
34-
; CHECK-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:24
35-
; CHECK-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:20
36-
; CHECK-NEXT: s_waitcnt expcnt(0)
37-
; CHECK-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
38-
; CHECK-NEXT: s_waitcnt vmcnt(0)
3919
; CHECK-NEXT: exp mrt0 off, off, off, off
4020
; CHECK-NEXT: v_mov_b32_e32 v4, 0
4121
; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen
@@ -50,42 +30,25 @@ define amdgpu_vs void @test(<4 x i32> inreg %arg1, <6 x float> addrspace(3)* %ar
5030
define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3, <8 x float> addrspace(3)* %arg4) {
5131
; CHECK-LABEL: test_2:
5232
; CHECK: ; %bb.0:
53-
; CHECK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
54-
; CHECK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
55-
; CHECK-NEXT: s_mov_b32 s10, -1
56-
; CHECK-NEXT: s_mov_b32 s11, 0xe8f000
57-
; CHECK-NEXT: s_add_u32 s8, s8, s5
58-
; CHECK-NEXT: s_addc_u32 s9, s9, 0
33+
; CHECK-NEXT: v_add_i32_e32 v5, vcc, 28, v1
5934
; CHECK-NEXT: v_add_i32_e32 v2, vcc, 24, v1
60-
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 28, v1
35+
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 20, v1
6136
; CHECK-NEXT: v_add_i32_e32 v6, vcc, 16, v1
62-
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 20, v1
37+
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 12, v1
6338
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 8, v1
64-
; CHECK-NEXT: v_add_i32_e32 v9, vcc, 12, v1
6539
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v1
6640
; CHECK-NEXT: s_mov_b32 m0, -1
6741
; CHECK-NEXT: ds_read_b32 v4, v2
68-
; CHECK-NEXT: ds_read_b32 v5, v3
42+
; CHECK-NEXT: ds_read_b32 v3, v3
6943
; CHECK-NEXT: ds_read_b32 v2, v6
70-
; CHECK-NEXT: ds_read_b32 v3, v7
44+
; CHECK-NEXT: ds_read_b32 v9, v7
7145
; CHECK-NEXT: ds_read_b32 v8, v8
72-
; CHECK-NEXT: ds_read_b32 v9, v9
7346
; CHECK-NEXT: ds_read_b32 v7, v10
7447
; CHECK-NEXT: ds_read_b32 v6, v1
75-
; CHECK-NEXT: s_waitcnt lgkmcnt(6)
76-
; CHECK-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:28
77-
; CHECK-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:24
78-
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
79-
; CHECK-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:20
80-
; CHECK-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:16
81-
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
82-
; CHECK-NEXT: buffer_store_dword v9, off, s[8:11], 0 offset:44
83-
; CHECK-NEXT: buffer_store_dword v8, off, s[8:11], 0 offset:40
48+
; CHECK-NEXT: ds_read_b32 v5, v5
8449
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
85-
; CHECK-NEXT: buffer_store_dword v7, off, s[8:11], 0 offset:36
86-
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
87-
; CHECK-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:32
8850
; CHECK-NEXT: tbuffer_store_format_xyzw v[6:9], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc
51+
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
8952
; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:16 glc slc
9053
; CHECK-NEXT: s_endpgm
9154
%load = load <8 x float>, <8 x float> addrspace(3)* %arg4, align 4
@@ -99,65 +62,42 @@ define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3,
9962
define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg %arg3, i32 %arg4, <6 x float> addrspace(3)* %arg5, <6 x float> addrspace(3)* %arg6) {
10063
; CHECK-LABEL: test_3:
10164
; CHECK: ; %bb.0:
102-
; CHECK-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
103-
; CHECK-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
104-
; CHECK-NEXT: s_mov_b32 s10, -1
105-
; CHECK-NEXT: s_mov_b32 s11, 0xe8f000
106-
; CHECK-NEXT: s_add_u32 s8, s8, s6
107-
; CHECK-NEXT: s_addc_u32 s9, s9, 0
10865
; CHECK-NEXT: s_mov_b32 s7, s5
10966
; CHECK-NEXT: s_mov_b32 s6, s4
11067
; CHECK-NEXT: s_mov_b32 s5, s3
11168
; CHECK-NEXT: s_mov_b32 s4, s2
112-
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 8, v1
113-
; CHECK-NEXT: v_add_i32_e32 v3, vcc, 12, v1
114-
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v1
115-
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 16, v1
69+
; CHECK-NEXT: v_add_i32_e32 v0, vcc, 16, v1
70+
; CHECK-NEXT: v_add_i32_e32 v6, vcc, 12, v1
71+
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 8, v1
72+
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v1
11673
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v1
11774
; CHECK-NEXT: v_mov_b32_e32 v9, s0
118-
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 8, v2
75+
; CHECK-NEXT: v_add_i32_e32 v10, vcc, 16, v2
11976
; CHECK-NEXT: v_add_i32_e32 v11, vcc, 12, v2
120-
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 4, v2
121-
; CHECK-NEXT: v_add_i32_e32 v13, vcc, 16, v2
122-
; CHECK-NEXT: v_add_i32_e32 v14, vcc, 20, v2
77+
; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2
12378
; CHECK-NEXT: s_mov_b32 m0, -1
124-
; CHECK-NEXT: ds_read_b32 v5, v0
125-
; CHECK-NEXT: ds_read_b32 v6, v3
126-
; CHECK-NEXT: ds_read_b32 v4, v4
127-
; CHECK-NEXT: ds_read_b32 v8, v8
128-
; CHECK-NEXT: ds_read_b32 v7, v7
12979
; CHECK-NEXT: ds_read_b32 v3, v1
130-
; CHECK-NEXT: s_waitcnt lgkmcnt(4)
131-
; CHECK-NEXT: buffer_store_dword v6, off, s[8:11], 0 offset:44
132-
; CHECK-NEXT: buffer_store_dword v5, off, s[8:11], 0 offset:40
133-
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
134-
; CHECK-NEXT: buffer_store_dword v4, off, s[8:11], 0 offset:36
135-
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
136-
; CHECK-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:32
80+
; CHECK-NEXT: ds_read_b32 v5, v4
81+
; CHECK-NEXT: ds_read_b32 v4, v7
82+
; CHECK-NEXT: ds_read_b32 v1, v8
83+
; CHECK-NEXT: ds_read_b32 v6, v6
84+
; CHECK-NEXT: ds_read_b32 v0, v0
85+
; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v2
86+
; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v2
87+
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
13788
; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc
138-
; CHECK-NEXT: tbuffer_store_format_xy v[7:8], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
139-
; CHECK-NEXT: ds_read_b32 v0, v10
140-
; CHECK-NEXT: ds_read_b32 v1, v11
141-
; CHECK-NEXT: s_waitcnt expcnt(1)
142-
; CHECK-NEXT: ds_read_b32 v3, v12
143-
; CHECK-NEXT: ds_read_b32 v4, v13
144-
; CHECK-NEXT: ds_read_b32 v2, v2
145-
; CHECK-NEXT: s_waitcnt lgkmcnt(3)
146-
; CHECK-NEXT: buffer_store_dword v1, off, s[8:11], 0 offset:28
147-
; CHECK-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:24
148-
; CHECK-NEXT: s_waitcnt lgkmcnt(2)
149-
; CHECK-NEXT: buffer_store_dword v3, off, s[8:11], 0 offset:20
15089
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
151-
; CHECK-NEXT: buffer_store_dword v2, off, s[8:11], 0 offset:16
152-
; CHECK-NEXT: s_waitcnt expcnt(1)
153-
; CHECK-NEXT: buffer_load_dword v3, off, s[8:11], 0 offset:28
90+
; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc
15491
; CHECK-NEXT: s_waitcnt expcnt(0)
155-
; CHECK-NEXT: buffer_load_dword v2, off, s[8:11], 0 offset:24
156-
; CHECK-NEXT: buffer_load_dword v1, off, s[8:11], 0 offset:20
157-
; CHECK-NEXT: buffer_load_dword v0, off, s[8:11], 0 offset:16
158-
; CHECK-NEXT: ds_read_b32 v5, v14
159-
; CHECK-NEXT: s_waitcnt vmcnt(0)
92+
; CHECK-NEXT: ds_read_b32 v0, v2
93+
; CHECK-NEXT: ds_read_b32 v2, v12
94+
; CHECK-NEXT: ds_read_b32 v1, v7
95+
; CHECK-NEXT: ds_read_b32 v5, v8
96+
; CHECK-NEXT: ds_read_b32 v3, v11
97+
; CHECK-NEXT: ds_read_b32 v4, v10
98+
; CHECK-NEXT: s_waitcnt lgkmcnt(5)
16099
; CHECK-NEXT: exp mrt0 off, off, off, off
100+
; CHECK-NEXT: s_waitcnt lgkmcnt(1)
161101
; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc
162102
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
163103
; CHECK-NEXT: tbuffer_store_format_xy v[4:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc

0 commit comments

Comments
 (0)