Skip to content

Commit 9c5abbb

Browse files
authored
Merge branch 'release/rocm-rel-7.0' into amd/dev/chfang/cherry
2 parents f1830f8 + a74b2f2 commit 9c5abbb

File tree

17 files changed

+520
-130
lines changed

17 files changed

+520
-130
lines changed

llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -77,8 +77,6 @@ class LiveRegOptimizer {
7777
const GCNSubtarget &ST;
7878
/// The scalar type to convert to
7979
Type *const ConvertToScalar;
80-
/// The set of visited Instructions
81-
SmallPtrSet<Instruction *, 4> Visited;
8280
/// Map of Value -> Converted Value
8381
ValueToValueMap ValMap;
8482
/// Map containing conversions from Optimal Type -> Original Type per BB.
@@ -248,6 +246,7 @@ bool LiveRegOptimizer::optimizeLiveType(
248246
SmallPtrSet<PHINode *, 4> PhiNodes;
249247
SmallPtrSet<Instruction *, 4> Defs;
250248
SmallPtrSet<Instruction *, 4> Uses;
249+
SmallPtrSet<Instruction *, 4> Visited;
251250

252251
Worklist.push_back(cast<Instruction>(I));
253252
while (!Worklist.empty()) {

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -391,8 +391,8 @@ def VOP_PERMLANE_SWAP : VOPProfile<[i32, i32, untyped, untyped]> {
391391
let HasExtDPP = 0;
392392
let HasExtSDWA = 0;
393393

394-
let Ins32 = (ins Src0RC64:$vdst_in, Src0RC32:$src0);
395-
let Ins64 = (ins Src0RC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl);
394+
let Ins32 = (ins DstRC:$vdst_in, Src0RC32:$src0);
395+
let Ins64 = (ins DstRC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl);
396396
let InsVOP3OpSel = (ins Src0RC64:$vdst_in, Src0RC64:$src0, Dpp16FI:$fi, DppBoundCtrl:$bound_ctrl);
397397
let Asm64 = "$vdst, $src0$bound_ctrl$fi";
398398
let AsmVOP3OpSel = "$vdst, $src0$bound_ctrl$fi";

llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,15 @@ bool PointerReplacer::collectUsers() {
278278
Worklist.emplace_back(I);
279279
};
280280

281+
auto TryPushInstOperand = [&](Instruction *InstOp) {
282+
if (!UsersToReplace.contains(InstOp)) {
283+
if (!ValuesToRevisit.insert(InstOp))
284+
return false;
285+
Worklist.emplace_back(InstOp);
286+
}
287+
return true;
288+
};
289+
281290
PushUsersToWorklist(&Root);
282291
while (!Worklist.empty()) {
283292
Instruction *Inst = Worklist.pop_back_val();
@@ -310,21 +319,26 @@ bool PointerReplacer::collectUsers() {
310319
// incoming values.
311320
Worklist.emplace_back(PHI);
312321
for (unsigned Idx = 0; Idx < PHI->getNumIncomingValues(); ++Idx) {
313-
auto *IncomingValue = cast<Instruction>(PHI->getIncomingValue(Idx));
314-
if (UsersToReplace.contains(IncomingValue))
315-
continue;
316-
if (!ValuesToRevisit.insert(IncomingValue))
322+
if (!TryPushInstOperand(cast<Instruction>(PHI->getIncomingValue(Idx))))
317323
return false;
318-
Worklist.emplace_back(IncomingValue);
319324
}
320325
} else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
321326
auto *TrueInst = dyn_cast<Instruction>(SI->getTrueValue());
322327
auto *FalseInst = dyn_cast<Instruction>(SI->getFalseValue());
323328
if (!TrueInst || !FalseInst)
324329
return false;
325330

326-
UsersToReplace.insert(SI);
327-
PushUsersToWorklist(SI);
331+
if (isAvailable(TrueInst) && isAvailable(FalseInst)) {
332+
UsersToReplace.insert(SI);
333+
PushUsersToWorklist(SI);
334+
continue;
335+
}
336+
337+
// Push select back onto the stack, followed by unavailable true/false
338+
// value.
339+
Worklist.emplace_back(SI);
340+
if (!TryPushInstOperand(TrueInst) || !TryPushInstOperand(FalseInst))
341+
return false;
328342
} else if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
329343
UsersToReplace.insert(GEP);
330344
PushUsersToWorklist(GEP);

llvm/lib/Transforms/Scalar/ConstraintElimination.cpp

Lines changed: 7 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -313,8 +313,7 @@ class ConstraintInfo {
313313
/// New variables that need to be added to the system are collected in
314314
/// \p NewVariables.
315315
ConstraintTy getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
316-
SmallVectorImpl<Value *> &NewVariables,
317-
bool ForceSignedSystem = false) const;
316+
SmallVectorImpl<Value *> &NewVariables) const;
318317

319318
/// Turns a comparison of the form \p Op0 \p Pred \p Op1 into a vector of
320319
/// constraints using getConstraint. Returns an empty constraint if the result
@@ -331,14 +330,6 @@ class ConstraintInfo {
331330
void transferToOtherSystem(CmpInst::Predicate Pred, Value *A, Value *B,
332331
unsigned NumIn, unsigned NumOut,
333332
SmallVectorImpl<StackEntry> &DFSInStack);
334-
335-
private:
336-
/// Adds facts into constraint system. \p ForceSignedSystem can be set when
337-
/// the \p Pred is eq/ne, and signed constraint system is used when it's
338-
/// specified.
339-
void addFactImpl(CmpInst::Predicate Pred, Value *A, Value *B, unsigned NumIn,
340-
unsigned NumOut, SmallVectorImpl<StackEntry> &DFSInStack,
341-
bool ForceSignedSystem);
342333
};
343334

344335
/// Represents a (Coefficient * Variable) entry after IR decomposition.
@@ -645,12 +636,8 @@ static Decomposition decompose(Value *V,
645636

646637
ConstraintTy
647638
ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
648-
SmallVectorImpl<Value *> &NewVariables,
649-
bool ForceSignedSystem) const {
639+
SmallVectorImpl<Value *> &NewVariables) const {
650640
assert(NewVariables.empty() && "NewVariables must be empty when passed in");
651-
assert((!ForceSignedSystem || CmpInst::isEquality(Pred)) &&
652-
"signed system can only be forced on eq/ne");
653-
654641
bool IsEq = false;
655642
bool IsNe = false;
656643

@@ -665,15 +652,15 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
665652
break;
666653
}
667654
case CmpInst::ICMP_EQ:
668-
if (!ForceSignedSystem && match(Op1, m_Zero())) {
655+
if (match(Op1, m_Zero())) {
669656
Pred = CmpInst::ICMP_ULE;
670657
} else {
671658
IsEq = true;
672659
Pred = CmpInst::ICMP_ULE;
673660
}
674661
break;
675662
case CmpInst::ICMP_NE:
676-
if (!ForceSignedSystem && match(Op1, m_Zero())) {
663+
if (match(Op1, m_Zero())) {
677664
Pred = CmpInst::getSwappedPredicate(CmpInst::ICMP_UGT);
678665
std::swap(Op0, Op1);
679666
} else {
@@ -690,7 +677,7 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
690677
return {};
691678

692679
SmallVector<ConditionTy, 4> Preconditions;
693-
bool IsSigned = ForceSignedSystem || CmpInst::isSigned(Pred);
680+
bool IsSigned = CmpInst::isSigned(Pred);
694681
auto &Value2Index = getValue2Index(IsSigned);
695682
auto ADec = decompose(Op0->stripPointerCastsSameRepresentation(),
696683
Preconditions, IsSigned, DL);
@@ -750,7 +737,7 @@ ConstraintInfo::getConstraint(CmpInst::Predicate Pred, Value *Op0, Value *Op1,
750737
int64_t OffsetSum;
751738
if (AddOverflow(Offset1, Offset2, OffsetSum))
752739
return {};
753-
if (Pred == CmpInst::ICMP_SLT || Pred == CmpInst::ICMP_ULT)
740+
if (Pred == (IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT))
754741
if (AddOverflow(OffsetSum, int64_t(-1), OffsetSum))
755742
return {};
756743
R[0] = OffsetSum;
@@ -1593,20 +1580,10 @@ static bool checkOrAndOpImpliedByOther(
15931580
void ConstraintInfo::addFact(CmpInst::Predicate Pred, Value *A, Value *B,
15941581
unsigned NumIn, unsigned NumOut,
15951582
SmallVectorImpl<StackEntry> &DFSInStack) {
1596-
addFactImpl(Pred, A, B, NumIn, NumOut, DFSInStack, false);
1597-
// If the Pred is eq/ne, also add the fact to signed system.
1598-
if (CmpInst::isEquality(Pred))
1599-
addFactImpl(Pred, A, B, NumIn, NumOut, DFSInStack, true);
1600-
}
1601-
1602-
void ConstraintInfo::addFactImpl(CmpInst::Predicate Pred, Value *A, Value *B,
1603-
unsigned NumIn, unsigned NumOut,
1604-
SmallVectorImpl<StackEntry> &DFSInStack,
1605-
bool ForceSignedSystem) {
16061583
// If the constraint has a pre-condition, skip the constraint if it does not
16071584
// hold.
16081585
SmallVector<Value *> NewVariables;
1609-
auto R = getConstraint(Pred, A, B, NewVariables, ForceSignedSystem);
1586+
auto R = getConstraint(Pred, A, B, NewVariables);
16101587

16111588
// TODO: Support non-equality for facts as well.
16121589
if (!R.isValid(*this) || R.isNe())

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -12799,25 +12799,47 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals) {
1279912799
InstructionCost SpillCost = getSpillCost();
1280012800
Cost += SpillCost + ExtractCost;
1280112801
auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef<int> Mask,
12802-
bool) {
12802+
bool ForSingleMask) {
1280312803
InstructionCost C = 0;
1280412804
unsigned VF = Mask.size();
1280512805
unsigned VecVF = TE->getVectorFactor();
12806-
if (VF != VecVF &&
12807-
(any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); }) ||
12808-
!ShuffleVectorInst::isIdentityMask(Mask, VF))) {
12809-
SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
12810-
std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12811-
OrigMask.begin());
12812-
C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
12813-
getWidenedType(TE->getMainOp()->getType(), VecVF),
12814-
OrigMask);
12815-
LLVM_DEBUG(
12816-
dbgs() << "SLP: Adding cost " << C
12817-
<< " for final shuffle of insertelement external users.\n";
12818-
TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12819-
Cost += C;
12820-
return std::make_pair(TE, true);
12806+
bool HasLargeIndex =
12807+
any_of(Mask, [VF](int Idx) { return Idx >= static_cast<int>(VF); });
12808+
if ((VF != VecVF && HasLargeIndex) ||
12809+
!ShuffleVectorInst::isIdentityMask(Mask, VF)) {
12810+
12811+
if (HasLargeIndex) {
12812+
SmallVector<int> OrigMask(VecVF, PoisonMaskElem);
12813+
std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)),
12814+
OrigMask.begin());
12815+
C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc,
12816+
getWidenedType(TE->getMainOp()->getType(), VecVF),
12817+
OrigMask);
12818+
LLVM_DEBUG(
12819+
dbgs() << "SLP: Adding cost " << C
12820+
<< " for final shuffle of insertelement external users.\n";
12821+
TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12822+
Cost += C;
12823+
return std::make_pair(TE, true);
12824+
}
12825+
12826+
if (!ForSingleMask) {
12827+
SmallVector<int> ResizeMask(VF, PoisonMaskElem);
12828+
for (unsigned I = 0; I < VF; ++I) {
12829+
if (Mask[I] != PoisonMaskElem)
12830+
ResizeMask[Mask[I]] = Mask[I];
12831+
}
12832+
if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF))
12833+
C = ::getShuffleCost(
12834+
*TTI, TTI::SK_PermuteSingleSrc,
12835+
getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask);
12836+
LLVM_DEBUG(
12837+
dbgs() << "SLP: Adding cost " << C
12838+
<< " for final shuffle of insertelement external users.\n";
12839+
TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n");
12840+
12841+
Cost += C;
12842+
}
1282112843
}
1282212844
return std::make_pair(TE, false);
1282312845
};
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-late-codegenprepare %s | FileCheck %s
3+
4+
; This crashed because the PHI with a splat was rejected, but then we marked the PHI
5+
; as visited and tried to convert one of its users afterwards.
6+
7+
define amdgpu_kernel void @widget(ptr %arg, ptr %arg1, ptr %arg2) {
8+
; CHECK-LABEL: define amdgpu_kernel void @widget(
9+
; CHECK-SAME: ptr [[ARG:%.*]], ptr [[ARG1:%.*]], ptr [[ARG2:%.*]]) {
10+
; CHECK-NEXT: [[BB:.*]]:
11+
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARG]], align 4
12+
; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i1
13+
; CHECK-NEXT: [[ARG1_LOAD:%.*]] = load <4 x i8>, ptr [[ARG1]], align 4
14+
; CHECK-NEXT: [[ARG2_LOAD:%.*]] = load i64, ptr [[ARG2]], align 4
15+
; CHECK-NEXT: br label %[[BB_1:.*]]
16+
; CHECK: [[BB_1]]:
17+
; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ null, %[[BB]] ], [ [[ARG1]], %[[BB_6:.*]] ]
18+
; CHECK-NEXT: [[PHI4:%.*]] = phi <4 x i8> [ splat (i8 1), %[[BB]] ], [ [[PHI15:%.*]], %[[BB_6]] ]
19+
; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_2:.*]], label %[[BB_6]]
20+
; CHECK: [[BB_2]]:
21+
; CHECK-NEXT: [[PHI7:%.*]] = phi <4 x i8> [ [[PHI13:%.*]], %[[BB_5:.*]] ], [ [[PHI4]], %[[BB_1]] ]
22+
; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_4:.*]], label %[[BB_5]]
23+
; CHECK: [[BB_3:.*]]:
24+
; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_4]], label %[[BB_EXIT:.*]]
25+
; CHECK: [[BB_4]]:
26+
; CHECK-NEXT: [[PHI11:%.*]] = phi <4 x i8> [ [[PHI7]], %[[BB_3]] ], [ zeroinitializer, %[[BB_2]] ]
27+
; CHECK-NEXT: store <4 x i8> [[PHI11]], ptr [[PHI]], align 1
28+
; CHECK-NEXT: br label %[[BB_5]]
29+
; CHECK: [[BB_5]]:
30+
; CHECK-NEXT: [[PHI13]] = phi <4 x i8> [ zeroinitializer, %[[BB_4]] ], [ [[PHI7]], %[[BB_2]] ]
31+
; CHECK-NEXT: br i1 [[TMP1]], label %[[BB_2]], label %[[BB_6]]
32+
; CHECK: [[BB_6]]:
33+
; CHECK-NEXT: [[PHI15]] = phi <4 x i8> [ [[ARG1_LOAD]], %[[BB_1]] ], [ zeroinitializer, %[[BB_5]] ]
34+
; CHECK-NEXT: br label %[[BB_1]]
35+
; CHECK: [[BB_EXIT]]:
36+
; CHECK-NEXT: ret void
37+
;
38+
bb:
39+
%ld = load i32, ptr %arg, align 4
40+
%ld.trunc = trunc i32 %ld to i1
41+
%arg1.load = load <4 x i8>, ptr %arg1, align 4
42+
%arg2.load = load i64, ptr %arg2, align 4
43+
br label %bb.1
44+
45+
bb.1:
46+
%phi = phi ptr [ null, %bb ], [ %arg1, %bb.6 ]
47+
%phi4 = phi <4 x i8> [ splat (i8 1), %bb ], [ %phi15, %bb.6 ]
48+
br i1 %ld.trunc, label %bb.2, label %bb.6
49+
50+
bb.2:
51+
%phi7 = phi <4 x i8> [ %phi13, %bb.5 ], [ %phi4, %bb.1 ]
52+
br i1 %ld.trunc, label %bb.4, label %bb.5
53+
54+
bb.3:
55+
br i1 %ld.trunc, label %bb.4, label %bb.exit
56+
57+
bb.4:
58+
%phi11 = phi <4 x i8> [ %phi7, %bb.3 ], [ zeroinitializer, %bb.2 ]
59+
store <4 x i8> %phi11, ptr %phi, align 1
60+
br label %bb.5
61+
62+
bb.5:
63+
%phi13 = phi <4 x i8> [ zeroinitializer, %bb.4 ], [ %phi7, %bb.2 ]
64+
br i1 %ld.trunc, label %bb.2, label %bb.6
65+
66+
bb.6:
67+
%phi15 = phi <4 x i8> [ %arg1.load, %bb.1 ], [ zeroinitializer, %bb.5 ]
68+
br label %bb.1
69+
70+
bb.exit:
71+
ret void
72+
}

0 commit comments

Comments
 (0)