Skip to content

Commit b4fc5f3

Browse files
authored
Merge branch 'release/rocm-rel-7.0' into amd/dev/kchoi/numbafix_rel7.0
2 parents 5c2dfa4 + 95c56e1 commit b4fc5f3

File tree

11 files changed

+453
-40
lines changed

11 files changed

+453
-40
lines changed

llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,6 @@ class LiveRegOptimizer {
7777
const GCNSubtarget &ST;
7878
/// The scalar type to convert to
7979
Type *const ConvertToScalar;
80-
/// The set of visited Instructions
81-
SmallPtrSet<Instruction *, 4> Visited;
8280
/// Map of Value -> Converted Value
8381
ValueToValueMap ValMap;
8482
/// Map containing conversions from Optimal Type -> Original Type per BB.
@@ -248,6 +246,7 @@ bool LiveRegOptimizer::optimizeLiveType(
248246
SmallPtrSet<PHINode *, 4> PhiNodes;
249247
SmallPtrSet<Instruction *, 4> Defs;
250248
SmallPtrSet<Instruction *, 4> Uses;
249+
SmallPtrSet<Instruction *, 4> Visited;
251250

252251
Worklist.push_back(cast<Instruction>(I));
253252
while (!Worklist.empty()) {

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -391,8 +391,8 @@ def VOP_PERMLANE_SWAP : VOPProfile<[i32, i32, untyped, untyped]> {
391391
let HasExtDPP = 0;
392392
let HasExtSDWA = 0;
393393

394-
let Ins32 = (ins Src0RC64:$vdst_in, Src0RC32:$src0);
395-
let Ins64 = (ins Src0RC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl);
394+
let Ins32 = (ins DstRC:$vdst_in, Src0RC32:$src0);
395+
let Ins64 = (ins DstRC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl);
396396
let InsVOP3OpSel = (ins Src0RC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl);
397397
let Asm64 = "$vdst, $src0$bound_ctrl$fi";
398398
let AsmVOP3OpSel = "$vdst, $src0$bound_ctrl$fi";

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -12799,25 +12799,47 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1279912799
InstructionCost SpillCost = getSpillCost();
1280012800
Cost += SpillCost + ExtractCost;
1280112801
auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
12802-
bool) {
12802+
bool ForSingleMask) {
1280312803
InstructionCost C = 0;
1280412804
unsigned VF = Mask.size();
1280512805
unsigned VecVF = TE->getVectorFactor();
12806-
if (VF != VecVF &&
12807-
(any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
12808-
!ShuffleVectorInst::isIdentityMask(Mask, VF))) {
12809-
SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
12810-
std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12811-
OrigMask.begin());
12812-
C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
12813-
getWidenedType(TE->getMainOp()->getType(), VecVF),
12814-
OrigMask);
12815-
LLVM_DEBUG(
12816-
dbgs() << "SLP: Adding cost " << C
12817-
<< " for final shuffle of insertelement external users.\n";
12818-
TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12819-
Cost += C;
12820-
return std::make_pair(TE, true);
12806+
bool HasLargeIndex =
12807+
any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
12808+
if ((VF != VecVF && HasLargeIndex) ||
12809+
!ShuffleVectorInst::isIdentityMask(Mask, VF)) {
12810+
12811+
if (HasLargeIndex) {
12812+
SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
12813+
std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12814+
OrigMask.begin());
12815+
C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
12816+
getWidenedType(TE->getMainOp()->getType(), VecVF),
12817+
OrigMask);
12818+
LLVM_DEBUG(
12819+
dbgs() << "SLP: Adding cost " << C
12820+
<< " for final shuffle of insertelement external users.\n";
12821+
TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12822+
Cost += C;
12823+
return std::make_pair(TE, true);
12824+
}
12825+
12826+
if (!ForSingleMask) {
12827+
SmallVector<int> ResizeMask(VF, PoisonMaskElem);
12828+
for (unsigned I = 0; I < VF; ++I) {
12829+
if (Mask[I] != PoisonMaskElem)
12830+
ResizeMask[Mask[I]] = Mask[I];
12831+
}
12832+
if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
12833+
C = ::getShuffleCost(
12834+
*TTI, TTI::SK_PermuteSingleSrc,
12835+
getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
12836+
LLVM_DEBUG(
12837+
dbgs() << "SLP: Adding cost " << C
12838+
<< " for final shuffle of insertelement external users.\n";
12839+
TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12840+
12841+
Cost += C;
12842+
}
1282112843
}
1282212844
return std::make_pair(TE, false);
1282312845
};
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-late-codegenprepare %s | FileCheck %s
3+
4+
; This crashed because the PHI with a splat was rejected, but then we marked the PHI
5+
; as visited and tried to convert one of its users afterwards.
6+
7+
define amdgpu_kernel void @widget(ptr %arg, ptr %arg1, ptr %arg2) {
8+
; CHECK-LABEL: define amdgpu_kernel void @widget(
9+
; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) {
10+
; CHECK-NEXT: [[BB:.*]]:
11+
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG]], align 4
12+
; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i1
13+
; CHECK-NEXT: [[ARG1_LOAD:%.*]] = load <4 x i8>, ptr [[ARG1]], align 4
14+
; CHECK-NEXT: [[ARG2_LOAD:%.*]] = load i64, ptr [[ARG2]], align 4
15+
; CHECK-NEXT: br label %[[BB_1:.*]]
16+
; CHECK: [[BB_1]]:
17+
; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ null, %[[BB]] ], [ [[ARG1]], %[[BB_6:.*]] ]
18+
; CHECK-NEXT: [[PHI4:%.*]] = phi <4 x i8> [ splat (i8 1), %[[BB]] ], [ [[PHI15:%.*]], %[[BB_6]] ]
19+
; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_2:.*]], label %[[BB_6]]
20+
; CHECK: [[BB_2]]:
21+
; CHECK-NEXT: [[PHI7:%.*]] = phi <4 x i8> [ [[PHI13:%.*]], %[[BB_5:.*]] ], [ [[PHI4]], %[[BB_1]] ]
22+
; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_4:.*]], label %[[BB_5]]
23+
; CHECK: [[BB_3:.*]]:
24+
; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_4]], label %[[BB_EXIT:.*]]
25+
; CHECK: [[BB_4]]:
26+
; CHECK-NEXT: [[PHI11:%.*]] = phi <4 x i8> [ [[PHI7]], %[[BB_3]] ], [ zeroinitializer, %[[BB_2]] ]
27+
; CHECK-NEXT: store <4 x i8> [[PHI11]], ptr [[PHI]], align 1
28+
; CHECK-NEXT: br label %[[BB_5]]
29+
; CHECK: [[BB_5]]:
30+
; CHECK-NEXT: [[PHI13]] = phi <4 x i8> [ zeroinitializer, %[[BB_4]] ], [ [[PHI7]], %[[BB_2]] ]
31+
; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_2]], label %[[BB_6]]
32+
; CHECK: [[BB_6]]:
33+
; CHECK-NEXT: [[PHI15]] = phi <4 x i8> [ [[ARG1_LOAD]], %[[BB_1]] ], [ zeroinitializer, %[[BB_5]] ]
34+
; CHECK-NEXT: br label %[[BB_1]]
35+
; CHECK: [[BB_EXIT]]:
36+
; CHECK-NEXT: ret void
37+
;
38+
bb:
39+
%ld = load i32, ptr %arg, align 4
40+
%ld.trunc = trunc i32 %ld to i1
41+
%arg1.load = load <4 x i8>, ptr %arg1, align 4
42+
%arg2.load = load i64, ptr %arg2, align 4
43+
br label %bb.1
44+
45+
bb.1:
46+
%phi = phi ptr [ null, %bb ], [ %arg1, %bb.6 ]
47+
%phi4 = phi <4 x i8> [ splat (i8 1), %bb ], [ %phi15, %bb.6 ]
48+
br i1 %ld.trunc, label %bb.2, label %bb.6
49+
50+
bb.2:
51+
%phi7 = phi <4 x i8> [ %phi13, %bb.5 ], [ %phi4, %bb.1 ]
52+
br i1 %ld.trunc, label %bb.4, label %bb.5
53+
54+
bb.3:
55+
br i1 %ld.trunc, label %bb.4, label %bb.exit
56+
57+
bb.4:
58+
%phi11 = phi <4 x i8> [ %phi7, %bb.3 ], [ zeroinitializer, %bb.2 ]
59+
store <4 x i8> %phi11, ptr %phi, align 1
60+
br label %bb.5
61+
62+
bb.5:
63+
%phi13 = phi <4 x i8> [ zeroinitializer, %bb.4 ], [ %phi7, %bb.2 ]
64+
br i1 %ld.trunc, label %bb.2, label %bb.6
65+
66+
bb.6:
67+
%phi15 = phi <4 x i8> [ %arg1.load, %bb.1 ], [ zeroinitializer, %bb.5 ]
68+
br label %bb.1
69+
70+
bb.exit:
71+
ret void
72+
}

llvm/test/CodeGen/AMDGPU/vni8-live-reg-opt.ll

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
4444
; FEATURE-NEXT: br label [[BB_2]]
4545
; FEATURE: bb.2:
4646
; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
47+
; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
48+
; FEATURE-NEXT: [[PHI5_TC2:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
4749
; FEATURE-NEXT: [[TMP2:%.*]] = trunc i32 [[PHI5_TC]] to i24
4850
; FEATURE-NEXT: [[TMP3:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
4951
; FEATURE-NEXT: store <3 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
@@ -67,6 +69,8 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
6769
; DEFAULT-NEXT: br label [[BB_2]]
6870
; DEFAULT: bb.2:
6971
; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
72+
; DEFAULT-NEXT: [[PHI5_TC1:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
73+
; DEFAULT-NEXT: [[PHI5_TC2:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
7074
; DEFAULT-NEXT: [[TMP2:%.*]] = trunc i32 [[PHI5_TC]] to i24
7175
; DEFAULT-NEXT: [[PHI5:%.*]] = bitcast i24 [[TMP2]] to <3 x i8>
7276
; DEFAULT-NEXT: store <3 x i8> [[PHI5]], ptr addrspace(1) [[DST]], align 4
@@ -126,6 +130,8 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
126130
; FEATURE-NEXT: br label [[BB_2]]
127131
; FEATURE: bb.2:
128132
; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
133+
; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
134+
; FEATURE-NEXT: [[PHI5_TC2:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
129135
; FEATURE-NEXT: [[PHI5_TC_BC:%.*]] = bitcast i32 [[PHI5_TC]] to <4 x i8>
130136
; FEATURE-NEXT: store <4 x i8> [[PHI5_TC_BC]], ptr addrspace(1) [[DST]], align 4
131137
; FEATURE-NEXT: ret void
@@ -146,6 +152,8 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
146152
; DEFAULT-NEXT: br label [[BB_2]]
147153
; DEFAULT: bb.2:
148154
; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
155+
; DEFAULT-NEXT: [[PHI5_TC1:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
156+
; DEFAULT-NEXT: [[PHI5_TC2:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
149157
; DEFAULT-NEXT: [[PHI5:%.*]] = bitcast i32 [[PHI5_TC]] to <4 x i8>
150158
; DEFAULT-NEXT: store <4 x i8> [[PHI5]], ptr addrspace(1) [[DST]], align 4
151159
; DEFAULT-NEXT: ret void
@@ -209,6 +217,8 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
209217
; FEATURE-NEXT: br label [[BB_2]]
210218
; FEATURE: bb.2:
211219
; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
220+
; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
221+
; FEATURE-NEXT: [[PHI5_TC2:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
212222
; FEATURE-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8>
213223
; FEATURE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
214224
; FEATURE-NEXT: store <5 x i8> [[TMP3]], ptr addrspace(1) [[DST]], align 4
@@ -232,6 +242,8 @@ define amdgpu_kernel void @v5i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
232242
; DEFAULT-NEXT: br label [[BB_2]]
233243
; DEFAULT: bb.2:
234244
; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
245+
; DEFAULT-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
246+
; DEFAULT-NEXT: [[PHI5_TC2:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
235247
; DEFAULT-NEXT: [[TMP2:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8>
236248
; DEFAULT-NEXT: [[PHI5:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <5 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4>
237249
; DEFAULT-NEXT: store <5 x i8> [[PHI5]], ptr addrspace(1) [[DST]], align 4
@@ -291,6 +303,8 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
291303
; FEATURE-NEXT: br label [[BB_2]]
292304
; FEATURE: bb.2:
293305
; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
306+
; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
307+
; FEATURE-NEXT: [[PHI5_TC2:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
294308
; FEATURE-NEXT: [[PHI5_TC_BC:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8>
295309
; FEATURE-NEXT: store <8 x i8> [[PHI5_TC_BC]], ptr addrspace(1) [[DST]], align 4
296310
; FEATURE-NEXT: ret void
@@ -311,6 +325,8 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
311325
; DEFAULT-NEXT: br label [[BB_2]]
312326
; DEFAULT: bb.2:
313327
; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
328+
; DEFAULT-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
329+
; DEFAULT-NEXT: [[PHI5_TC2:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
314330
; DEFAULT-NEXT: [[PHI5:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8>
315331
; DEFAULT-NEXT: store <8 x i8> [[PHI5]], ptr addrspace(1) [[DST]], align 4
316332
; DEFAULT-NEXT: ret void
@@ -377,6 +393,8 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr
377393
; FEATURE-NEXT: br label [[RETURN_SINK_SPLIT]]
378394
; FEATURE: return.sink.split:
379395
; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC1_BC]], [[ENTRY]] ]
396+
; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC1_BC]], [[ENTRY]] ]
397+
; FEATURE-NEXT: [[PHI5_TC2:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC1_BC]], [[ENTRY]] ]
380398
; FEATURE-NEXT: [[PHI5_TC_BC:%.*]] = bitcast i32 [[PHI5_TC]] to <4 x i8>
381399
; FEATURE-NEXT: store <4 x i8> [[PHI5_TC_BC]], ptr addrspace(1) [[DST]], align 4
382400
; FEATURE-NEXT: ret void
@@ -402,6 +420,8 @@ define amdgpu_kernel void @repeat_successor(i32 %in, ptr addrspace(1) %src1, ptr
402420
; DEFAULT-NEXT: br label [[RETURN_SINK_SPLIT]]
403421
; DEFAULT: return.sink.split:
404422
; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC1_BC]], [[ENTRY]] ]
423+
; DEFAULT-NEXT: [[PHI5_TC1:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC1_BC]], [[ENTRY]] ]
424+
; DEFAULT-NEXT: [[PHI5_TC2:%.*]] = phi i32 [ [[VEC2_BC]], [[SW_BB5]] ], [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC1_BC]], [[ENTRY]] ]
405425
; DEFAULT-NEXT: [[PHI5:%.*]] = bitcast i32 [[PHI5_TC]] to <4 x i8>
406426
; DEFAULT-NEXT: store <4 x i8> [[PHI5]], ptr addrspace(1) [[DST]], align 4
407427
; DEFAULT-NEXT: ret void
@@ -476,11 +496,17 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
476496
; FEATURE-NEXT: br i1 [[CMP2]], label [[BB_2]], label [[BB_3:%.*]]
477497
; FEATURE: bb.2:
478498
; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
499+
; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
500+
; FEATURE-NEXT: [[PHI5_TC3:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
501+
; FEATURE-NEXT: [[PHI5_TC5:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
479502
; FEATURE-NEXT: [[PHI5_TC_BC:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8>
480503
; FEATURE-NEXT: store <8 x i8> [[PHI5_TC_BC]], ptr addrspace(1) [[DST0]], align 4
481504
; FEATURE-NEXT: br label [[BB_3]]
482505
; FEATURE: bb.3:
483506
; FEATURE-NEXT: [[PHI7_TC:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC]], [[BB_2]] ]
507+
; FEATURE-NEXT: [[PHI7_TC2:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC1]], [[BB_2]] ]
508+
; FEATURE-NEXT: [[PHI7_TC4:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC3]], [[BB_2]] ]
509+
; FEATURE-NEXT: [[PHI7_TC6:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC5]], [[BB_2]] ]
484510
; FEATURE-NEXT: [[PHI7_TC_BC:%.*]] = bitcast <2 x i32> [[PHI7_TC]] to <8 x i8>
485511
; FEATURE-NEXT: store <8 x i8> [[PHI7_TC_BC]], ptr addrspace(1) [[DST1]], align 4
486512
; FEATURE-NEXT: ret void
@@ -502,11 +528,17 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
502528
; DEFAULT-NEXT: br i1 [[CMP2]], label [[BB_2]], label [[BB_3:%.*]]
503529
; DEFAULT: bb.2:
504530
; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ]
531+
; DEFAULT-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
532+
; DEFAULT-NEXT: [[PHI5_TC3:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
533+
; DEFAULT-NEXT: [[PHI5_TC5:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
505534
; DEFAULT-NEXT: [[PHI5:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8>
506535
; DEFAULT-NEXT: store <8 x i8> [[PHI5]], ptr addrspace(1) [[DST0]], align 4
507536
; DEFAULT-NEXT: br label [[BB_3]]
508537
; DEFAULT: bb.3:
509538
; DEFAULT-NEXT: [[PHI7_TC:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC]], [[BB_2]] ]
539+
; DEFAULT-NEXT: [[PHI7_TC2:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC1]], [[BB_2]] ]
540+
; DEFAULT-NEXT: [[PHI7_TC4:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC3]], [[BB_2]] ]
541+
; DEFAULT-NEXT: [[PHI7_TC6:%.*]] = phi <2 x i32> [ [[VEC2_BC]], [[BB_1]] ], [ [[PHI5_TC5]], [[BB_2]] ]
510542
; DEFAULT-NEXT: [[PHI7:%.*]] = bitcast <2 x i32> [[PHI7_TC]] to <8 x i8>
511543
; DEFAULT-NEXT: store <8 x i8> [[PHI7]], ptr addrspace(1) [[DST1]], align 4
512544
; DEFAULT-NEXT: ret void
@@ -581,6 +613,8 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
581613
; FEATURE-NEXT: br label [[BB_3]]
582614
; FEATURE: bb.3:
583615
; FEATURE-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ]
616+
; FEATURE-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ]
617+
; FEATURE-NEXT: [[PHI5_TC2:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ]
584618
; FEATURE-NEXT: [[PHI5_TC_BC:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8>
585619
; FEATURE-NEXT: store <8 x i8> [[PHI5_TC_BC]], ptr addrspace(1) [[DST1]], align 4
586620
; FEATURE-NEXT: ret void
@@ -606,6 +640,8 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
606640
; DEFAULT-NEXT: br label [[BB_3]]
607641
; DEFAULT: bb.3:
608642
; DEFAULT-NEXT: [[PHI5_TC:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ]
643+
; DEFAULT-NEXT: [[PHI5_TC1:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ]
644+
; DEFAULT-NEXT: [[PHI5_TC2:%.*]] = phi <2 x i32> [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ], [ [[VEC2_BC]], [[BB_2]] ]
609645
; DEFAULT-NEXT: [[PHI5:%.*]] = bitcast <2 x i32> [[PHI5_TC]] to <8 x i8>
610646
; DEFAULT-NEXT: store <8 x i8> [[PHI5]], ptr addrspace(1) [[DST1]], align 4
611647
; DEFAULT-NEXT: ret void
@@ -666,6 +702,8 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp
666702
; FEATURE-NEXT: br label [[BB_1:%.*]]
667703
; FEATURE: bb.1:
668704
; FEATURE-NEXT: [[TEMP_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC:%.*]], [[BB_1]] ]
705+
; FEATURE-NEXT: [[TEMP_TC1:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
706+
; FEATURE-NEXT: [[TEMP_TC2:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
669707
; FEATURE-NEXT: [[TEMP_TC_BC:%.*]] = bitcast i32 [[TEMP_TC]] to <4 x i8>
670708
; FEATURE-NEXT: [[VEC1_BC_BC:%.*]] = bitcast i32 [[VEC1_BC]] to <4 x i8>
671709
; FEATURE-NEXT: [[VEC2:%.*]] = shufflevector <4 x i8> [[VEC1_BC_BC]], <4 x i8> [[TEMP_TC_BC]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -689,6 +727,8 @@ define amdgpu_kernel void @v32i8_loop_carried(ptr addrspace(1) %src1, ptr addrsp
689727
; DEFAULT-NEXT: br label [[BB_1:%.*]]
690728
; DEFAULT: bb.1:
691729
; DEFAULT-NEXT: [[TEMP_TC:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY:%.*]] ], [ [[VEC2_BC:%.*]], [[BB_1]] ]
730+
; DEFAULT-NEXT: [[TEMP_TC1:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
731+
; DEFAULT-NEXT: [[TEMP_TC2:%.*]] = phi i32 [ [[VEC1_BC]], [[ENTRY]] ], [ [[VEC2_BC]], [[BB_1]] ]
692732
; DEFAULT-NEXT: [[TEMP_TC_BC:%.*]] = bitcast i32 [[TEMP_TC]] to <4 x i8>
693733
; DEFAULT-NEXT: [[VEC1_BC_BC:%.*]] = bitcast i32 [[VEC1_BC]] to <4 x i8>
694734
; DEFAULT-NEXT: [[VEC3:%.*]] = shufflevector <4 x i8> [[VEC1_BC_BC]], <4 x i8> [[TEMP_TC_BC]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>

0 commit comments

Comments
 (0)