Skip to content

Commit e987628

Browse files
committed
transform alloca only
1 parent 40be8d5 commit e987628

File tree

2 files changed

+92
-104
lines changed

2 files changed

+92
-104
lines changed

llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,14 @@ static Type *getIntOrVecTypeForSize(uint64_t NBytes, LLVMContext &Ctx,
142142

143143
static Align minAlign(Align A, Align B) { return A < B ? A : B; }
144144

145+
// Checks if the underlying object of a memcpy operand is an alloca.
146+
// This helps focus on scratch memory optimizations by filtering out
147+
// memcpy operations that don't involve stack-allocated memory.
148+
static bool hasAllocaUnderlyingObject(Value *V) {
149+
// Resolve V to the object it is based on, then classify that object.
// NOTE(review): getUnderlyingObject (singular) presumably does not look
// through select/phi — if so, a pointer that is a select of two allocas
// yields false here. Confirm whether getUnderlyingObjects is needed.
Value *Underlying = getUnderlyingObject(V);
150+
// True only when the resolved object is itself an AllocaInst.
return isa<AllocaInst>(Underlying);
151+
}
152+
145153
// Checks if both pointer operands can be speculatively loaded for N bytes and
146154
// computes the minimum alignment to use.
147155
// Notes:
@@ -493,6 +501,10 @@ AMDGPUVectorIdiomCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
493501
<< " - srcIsSelect=" << (isa<SelectInst>(SrcV) ? "true" : "false")
494502
<< '\n'
495503
<< " - dstIsSelect=" << (isa<SelectInst>(DstV) ? "true" : "false")
504+
<< '\n'
505+
<< " - srcIsAlloca=" << (hasAllocaUnderlyingObject(SrcV) ? "true" : "false")
506+
<< '\n'
507+
<< " - dstIsAlloca=" << (hasAllocaUnderlyingObject(DstV) ? "true" : "false")
496508
<< '\n';
497509

498510
dumpSelect("src", SrcV);
@@ -526,6 +538,15 @@ AMDGPUVectorIdiomCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
526538
continue;
527539
}
528540

541+
// Focus on alloca-based memcpy operations to reduce scratch usage
542+
bool DstIsAlloca = hasAllocaUnderlyingObject(Dst);
543+
bool SrcIsAlloca = hasAllocaUnderlyingObject(Src);
544+
if (!DstIsAlloca && !SrcIsAlloca) {
545+
LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: neither source nor "
546+
<< "destination underlying object is alloca\n");
547+
continue;
548+
}
549+
529550
if (auto *Sel = dyn_cast<SelectInst>(Src)) {
530551
Changed |= Impl.transformSelectMemcpySource(*MT, *Sel, DL, &DT, &AC);
531552
continue;

llvm/test/CodeGen/AMDGPU/amdgpu-vector-idiom-memcpy-select.ll

Lines changed: 71 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -9,35 +9,38 @@
99
@G0 = addrspace(1) global [4 x i32] zeroinitializer, align 16
1010
@G1 = addrspace(1) global [4 x i32] zeroinitializer, align 16
1111

12-
declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg)
12+
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i1 immarg)
13+
declare void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg)
1314

1415
; -----------------------------------------------------------------------------
1516
; Source is a select. Expect value-level select of two <4 x i32> loads
1617
; and a single store, with no remaining memcpy.
1718
;
18-
define amdgpu_kernel void @value_select_src(ptr addrspace(1) %dst, i1 %cond) {
19+
define amdgpu_kernel void @value_select_src(i1 %cond) {
1920
; CHECK-LABEL: define amdgpu_kernel void @value_select_src(
20-
; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
21+
; CHECK-SAME: i1 [[COND:%.*]]) {
2122
; CHECK-NEXT: [[ENTRY:.*:]]
22-
; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) @G0, i64 0, i64 0
23-
; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) @G1, i64 0, i64 0
24-
; CHECK-NEXT: [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[A]], ptr addrspace(1) [[B]]
25-
; CHECK-NEXT: [[LA:%.*]] = load <4 x i32>, ptr addrspace(1) [[A]], align 16
26-
; CHECK-NEXT: [[LB:%.*]] = load <4 x i32>, ptr addrspace(1) [[B]], align 16
23+
; CHECK-NEXT: [[PA:%.*]] = alloca [4 x i32], align 16, addrspace(5)
24+
; CHECK-NEXT: [[PB:%.*]] = alloca [4 x i32], align 16, addrspace(5)
25+
; CHECK-NEXT: [[DST:%.*]] = alloca [4 x i32], align 16, addrspace(5)
26+
; CHECK-NEXT: [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(5) [[PA]], ptr addrspace(5) [[PB]]
27+
; CHECK-NEXT: [[LA:%.*]] = load <4 x i32>, ptr addrspace(5) [[PA]], align 16
28+
; CHECK-NEXT: [[LB:%.*]] = load <4 x i32>, ptr addrspace(5) [[PB]], align 16
2729
; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND]], <4 x i32> [[LA]], <4 x i32> [[LB]]
28-
; CHECK-NEXT: store <4 x i32> [[SEL]], ptr addrspace(1) [[DST]], align 16
30+
; CHECK-NEXT: store <4 x i32> [[SEL]], ptr addrspace(5) [[DST]], align 16
2931
; CHECK-NEXT: ret void
3032
;
3133
entry:
32-
; Pointers to two 16-byte aligned buffers in the same addrspace(1).
33-
%pa = getelementptr inbounds [4 x i32], ptr addrspace(1) @G0, i64 0, i64 0
34-
%pb = getelementptr inbounds [4 x i32], ptr addrspace(1) @G1, i64 0, i64 0
35-
%src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
34+
; Two 16-byte aligned source buffers plus the destination, all allocated with alloca.
35+
%pa = alloca [4 x i32], align 16, addrspace(5)
36+
%pb = alloca [4 x i32], align 16, addrspace(5)
37+
%dst = alloca [4 x i32], align 16, addrspace(5)
38+
%src = select i1 %cond, ptr addrspace(5) %pa, ptr addrspace(5) %pb
3639

3740
; Provide explicit operand alignments so the pass can emit an aligned store.
38-
call void @llvm.memcpy.p1.p1.i64(
39-
ptr addrspace(1) align 16 %dst,
40-
ptr addrspace(1) align 16 %src,
41+
call void @llvm.memcpy.p5.p5.i64(
42+
ptr addrspace(5) align 16 %dst,
43+
ptr addrspace(5) align 16 %src,
4144
i64 16, i1 false)
4245

4346
ret void
@@ -47,25 +50,30 @@ entry:
4750
; Destination is a select. Expect CFG split with two memcpys guarded
4851
; by a branch (we do not speculate stores in this pass).
4952
;
50-
define amdgpu_kernel void @dest_select_cfg_split(ptr addrspace(1) %da, ptr addrspace(1) %db,
53+
define amdgpu_kernel void @dest_select_cfg_split(i1 %cond) {
5154
; CHECK-LABEL: define amdgpu_kernel void @dest_select_cfg_split(
52-
; CHECK-SAME: ptr addrspace(1) [[DA:%.*]], ptr addrspace(1) [[DB:%.*]], ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
55+
; CHECK-SAME: i1 [[COND:%.*]]) {
5356
; CHECK-NEXT: [[ENTRY:.*:]]
54-
; CHECK-NEXT: [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) [[DA]], ptr addrspace(1) [[DB]]
57+
; CHECK-NEXT: [[DA:%.*]] = alloca [4 x i32], align 16, addrspace(5)
58+
; CHECK-NEXT: [[DB:%.*]] = alloca [4 x i32], align 16, addrspace(5)
59+
; CHECK-NEXT: [[SRC:%.*]] = alloca [4 x i32], align 16, addrspace(5)
60+
; CHECK-NEXT: [[DST:%.*]] = select i1 [[COND]], ptr addrspace(5) [[DA]], ptr addrspace(5) [[DB]]
5561
; CHECK-NEXT: br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
5662
; CHECK: [[MEMCPY_JOIN:.*]]:
5763
; CHECK-NEXT: ret void
5864
; CHECK: [[MEMCPY_THEN]]:
59-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DA]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
65+
; CHECK-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) [[DA]], ptr addrspace(5) [[SRC]], i64 16, i1 false)
6066
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
6167
; CHECK: [[MEMCPY_ELSE]]:
62-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DB]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
68+
; CHECK-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) [[DB]], ptr addrspace(5) [[SRC]], i64 16, i1 false)
6369
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
6470
;
65-
ptr addrspace(1) %src, i1 %cond) {
6671
entry:
67-
%dst = select i1 %cond, ptr addrspace(1) %da, ptr addrspace(1) %db
68-
call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
72+
%da = alloca [4 x i32], align 16, addrspace(5)
73+
%db = alloca [4 x i32], align 16, addrspace(5)
74+
%src = alloca [4 x i32], align 16, addrspace(5)
75+
%dst = select i1 %cond, ptr addrspace(5) %da, ptr addrspace(5) %db
76+
call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) %dst, ptr addrspace(5) %src, i64 16, i1 false)
6977
ret void
7078
}
7179

@@ -75,27 +83,29 @@ entry:
7583
;
7684
@G2 = addrspace(1) global [4 x double] zeroinitializer, align 32
7785
@G3 = addrspace(1) global [4 x double] zeroinitializer, align 32
78-
define amdgpu_kernel void @value_select_src_4xd(ptr addrspace(1) %dst, i1 %cond) {
86+
define amdgpu_kernel void @value_select_src_4xd(i1 %cond) {
7987
; CHECK-LABEL: define amdgpu_kernel void @value_select_src_4xd(
80-
; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
88+
; CHECK-SAME: i1 [[COND:%.*]]) {
8189
; CHECK-NEXT: [[ENTRY:.*:]]
82-
; CHECK-NEXT: [[PA:%.*]] = getelementptr inbounds [4 x double], ptr addrspace(1) @G2, i64 0, i64 0
83-
; CHECK-NEXT: [[PB:%.*]] = getelementptr inbounds [4 x double], ptr addrspace(1) @G3, i64 0, i64 0
84-
; CHECK-NEXT: [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
85-
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr addrspace(1) [[PA]], align 32
86-
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(1) [[PB]], align 32
90+
; CHECK-NEXT: [[PA:%.*]] = alloca [4 x double], align 32, addrspace(5)
91+
; CHECK-NEXT: [[PB:%.*]] = alloca [4 x double], align 32, addrspace(5)
92+
; CHECK-NEXT: [[DST:%.*]] = alloca [4 x double], align 32, addrspace(5)
93+
; CHECK-NEXT: [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(5) [[PA]], ptr addrspace(5) [[PB]]
94+
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr addrspace(5) [[PA]], align 32
95+
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(5) [[PB]], align 32
8796
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[COND]], <4 x i64> [[TMP0]], <4 x i64> [[TMP1]]
88-
; CHECK-NEXT: store <4 x i64> [[TMP2]], ptr addrspace(1) [[DST]], align 32
97+
; CHECK-NEXT: store <4 x i64> [[TMP2]], ptr addrspace(5) [[DST]], align 32
8998
; CHECK-NEXT: ret void
9099
;
91100
entry:
92-
%pa = getelementptr inbounds [4 x double], ptr addrspace(1) @G2, i64 0, i64 0
93-
%pb = getelementptr inbounds [4 x double], ptr addrspace(1) @G3, i64 0, i64 0
94-
%src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
101+
%pa = alloca [4 x double], align 32, addrspace(5)
102+
%pb = alloca [4 x double], align 32, addrspace(5)
103+
%dst = alloca [4 x double], align 32, addrspace(5)
104+
%src = select i1 %cond, ptr addrspace(5) %pa, ptr addrspace(5) %pb
95105

96-
call void @llvm.memcpy.p1.p1.i64(
97-
ptr addrspace(1) align 32 %dst,
98-
ptr addrspace(1) align 32 %src,
106+
call void @llvm.memcpy.p5.p5.i64(
107+
ptr addrspace(5) align 32 %dst,
108+
ptr addrspace(5) align 32 %src,
99109
i64 32, i1 false)
100110

101111
ret void
@@ -107,27 +117,29 @@ entry:
107117
;
108118
@G4 = addrspace(1) global [3 x i8] zeroinitializer, align 1
109119
@G5 = addrspace(1) global [3 x i8] zeroinitializer, align 1
110-
define amdgpu_kernel void @value_select_src_3xc(ptr addrspace(1) %dst, i1 %cond) {
120+
define amdgpu_kernel void @value_select_src_3xc(i1 %cond) {
111121
; CHECK-LABEL: define amdgpu_kernel void @value_select_src_3xc(
112-
; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
122+
; CHECK-SAME: i1 [[COND:%.*]]) {
113123
; CHECK-NEXT: [[ENTRY:.*:]]
114-
; CHECK-NEXT: [[PA:%.*]] = getelementptr inbounds [3 x i8], ptr addrspace(1) @G4, i64 0, i64 0
115-
; CHECK-NEXT: [[PB:%.*]] = getelementptr inbounds [3 x i8], ptr addrspace(1) @G5, i64 0, i64 0
116-
; CHECK-NEXT: [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
117-
; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i8>, ptr addrspace(1) [[PA]], align 1
118-
; CHECK-NEXT: [[TMP1:%.*]] = load <3 x i8>, ptr addrspace(1) [[PB]], align 1
124+
; CHECK-NEXT: [[PA:%.*]] = alloca [3 x i8], align 1, addrspace(5)
125+
; CHECK-NEXT: [[PB:%.*]] = alloca [3 x i8], align 1, addrspace(5)
126+
; CHECK-NEXT: [[DST:%.*]] = alloca [3 x i8], align 1, addrspace(5)
127+
; CHECK-NEXT: [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(5) [[PA]], ptr addrspace(5) [[PB]]
128+
; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i8>, ptr addrspace(5) [[PA]], align 1
129+
; CHECK-NEXT: [[TMP1:%.*]] = load <3 x i8>, ptr addrspace(5) [[PB]], align 1
119130
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[COND]], <3 x i8> [[TMP0]], <3 x i8> [[TMP1]]
120-
; CHECK-NEXT: store <3 x i8> [[TMP2]], ptr addrspace(1) [[DST]], align 1
131+
; CHECK-NEXT: store <3 x i8> [[TMP2]], ptr addrspace(5) [[DST]], align 1
121132
; CHECK-NEXT: ret void
122133
;
123134
entry:
124-
%pa = getelementptr inbounds [3 x i8], ptr addrspace(1) @G4, i64 0, i64 0
125-
%pb = getelementptr inbounds [3 x i8], ptr addrspace(1) @G5, i64 0, i64 0
126-
%src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
135+
%pa = alloca [3 x i8], align 1, addrspace(5)
136+
%pb = alloca [3 x i8], align 1, addrspace(5)
137+
%dst = alloca [3 x i8], align 1, addrspace(5)
138+
%src = select i1 %cond, ptr addrspace(5) %pa, ptr addrspace(5) %pb
127139

128-
call void @llvm.memcpy.p1.p1.i64(
129-
ptr addrspace(1) align 1 %dst,
130-
ptr addrspace(1) align 1 %src,
140+
call void @llvm.memcpy.p5.p5.i64(
141+
ptr addrspace(5) align 1 %dst,
142+
ptr addrspace(5) align 1 %src,
131143
i64 3, i1 false)
132144

133145
ret void
@@ -144,10 +156,7 @@ define amdgpu_kernel void @value_select_src_constexpr_gep(ptr addrspace(1) %dst,
144156
; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
145157
; CHECK-NEXT: [[ENTRY:.*:]]
146158
; CHECK-NEXT: [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) @GEPA, ptr addrspace(1) @GEPB
147-
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr addrspace(1) @GEPA, align 16
148-
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) @GEPB, align 16
149-
; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[COND]], <4 x i32> [[TMP0]], <4 x i32> [[TMP1]]
150-
; CHECK-NEXT: store <4 x i32> [[TMP2]], ptr addrspace(1) [[DST]], align 16
159+
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 16 [[DST]], ptr addrspace(1) align 16 [[SRC]], i64 16, i1 false)
151160
; CHECK-NEXT: ret void
152161
;
153162
entry:
@@ -172,15 +181,8 @@ define amdgpu_kernel void @dest_select_constexpr_gep(ptr addrspace(1) %src, i1 %
172181
; CHECK-SAME: ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
173182
; CHECK-NEXT: [[ENTRY:.*:]]
174183
; CHECK-NEXT: [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) @GEPA, ptr addrspace(1) @GEPB
175-
; CHECK-NEXT: br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
176-
; CHECK: [[MEMCPY_JOIN:.*]]:
184+
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
177185
; CHECK-NEXT: ret void
178-
; CHECK: [[MEMCPY_THEN]]:
179-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) @GEPA, ptr addrspace(1) [[SRC]], i64 16, i1 false)
180-
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
181-
; CHECK: [[MEMCPY_ELSE]]:
182-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) @GEPB, ptr addrspace(1) [[SRC]], i64 16, i1 false)
183-
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
184186
;
185187
entry:
186188
%dst = select i1 %cond,
@@ -201,15 +203,8 @@ define amdgpu_kernel void @src_select_null_arm(ptr addrspace(1) %dst, i1 %cond)
201203
; CHECK-NEXT: [[ENTRY:.*:]]
202204
; CHECK-NEXT: [[NONNULL:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) @GN, i64 0, i64 0
203205
; CHECK-NEXT: [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[NONNULL]], ptr addrspace(1) null
204-
; CHECK-NEXT: br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
205-
; CHECK: [[MEMCPY_JOIN:.*]]:
206+
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
206207
; CHECK-NEXT: ret void
207-
; CHECK: [[MEMCPY_THEN]]:
208-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[NONNULL]], i64 16, i1 false)
209-
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
210-
; CHECK: [[MEMCPY_ELSE]]:
211-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) null, i64 16, i1 false)
212-
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
213208
;
214209
entry:
215210
%nonnull = getelementptr inbounds [4 x i32], ptr addrspace(1) @GN, i64 0, i64 0
@@ -228,15 +223,8 @@ define amdgpu_kernel void @dst_select_null_arm(ptr addrspace(1) %src, i1 %cond)
228223
; CHECK-SAME: ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
229224
; CHECK-NEXT: [[ENTRY:.*:]]
230225
; CHECK-NEXT: [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) null, ptr addrspace(1) @GN
231-
; CHECK-NEXT: br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
232-
; CHECK: [[MEMCPY_JOIN:.*]]:
226+
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
233227
; CHECK-NEXT: ret void
234-
; CHECK: [[MEMCPY_THEN]]:
235-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) null, ptr addrspace(1) [[SRC]], i64 16, i1 false)
236-
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
237-
; CHECK: [[MEMCPY_ELSE]]:
238-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) @GN, ptr addrspace(1) [[SRC]], i64 16, i1 false)
239-
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
240228
;
241229
entry:
242230
%dst = select i1 %cond, ptr addrspace(1) null,
@@ -256,15 +244,8 @@ define amdgpu_kernel void @src_select_poison_arm(ptr addrspace(1) %dst, i1 %cond
256244
; CHECK-NEXT: [[ENTRY:.*:]]
257245
; CHECK-NEXT: [[NONNULL:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) @GP, i64 0, i64 0
258246
; CHECK-NEXT: [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[NONNULL]], ptr addrspace(1) poison
259-
; CHECK-NEXT: br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
260-
; CHECK: [[MEMCPY_JOIN:.*]]:
247+
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
261248
; CHECK-NEXT: ret void
262-
; CHECK: [[MEMCPY_THEN]]:
263-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[NONNULL]], i64 16, i1 false)
264-
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
265-
; CHECK: [[MEMCPY_ELSE]]:
266-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) poison, i64 16, i1 false)
267-
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
268249
;
269250
entry:
270251
%nonnull = getelementptr inbounds [4 x i32], ptr addrspace(1) @GP, i64 0, i64 0
@@ -283,15 +264,8 @@ define amdgpu_kernel void @dst_select_poison_arm(ptr addrspace(1) %src, i1 %cond
283264
; CHECK-SAME: ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
284265
; CHECK-NEXT: [[ENTRY:.*:]]
285266
; CHECK-NEXT: [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) poison, ptr addrspace(1) @GP
286-
; CHECK-NEXT: br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
287-
; CHECK: [[MEMCPY_JOIN:.*]]:
267+
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
288268
; CHECK-NEXT: ret void
289-
; CHECK: [[MEMCPY_THEN]]:
290-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) poison, ptr addrspace(1) [[SRC]], i64 16, i1 false)
291-
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
292-
; CHECK: [[MEMCPY_ELSE]]:
293-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) @GP, ptr addrspace(1) [[SRC]], i64 16, i1 false)
294-
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
295269
;
296270
entry:
297271
%dst = select i1 %cond, ptr addrspace(1) poison,
@@ -354,15 +328,8 @@ define amdgpu_kernel void @memcpy_src_select_arg_arms_cfg_split(ptr addrspace(1)
354328
; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[PA:%.*]], ptr addrspace(1) [[PB:%.*]], i1 [[COND:%.*]]) {
355329
; CHECK-NEXT: [[ENTRY:.*:]]
356330
; CHECK-NEXT: [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
357-
; CHECK-NEXT: br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
358-
; CHECK: [[MEMCPY_JOIN:.*]]:
331+
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
359332
; CHECK-NEXT: ret void
360-
; CHECK: [[MEMCPY_THEN]]:
361-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[PA]], i64 16, i1 false)
362-
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
363-
; CHECK: [[MEMCPY_ELSE]]:
364-
; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[PB]], i64 16, i1 false)
365-
; CHECK-NEXT: br label %[[MEMCPY_JOIN]]
366333
;
367334
ptr addrspace(1) %pa,
368335
ptr addrspace(1) %pb,

0 commit comments

Comments
 (0)