Skip to content

Commit db5f7dc

Browse files
committed
Revert "[SLP]Support LShr as base for copyable elements"
This reverts commit ca4ebf9. Causes compile-time crashes for some inputs with RVV zvl512b/zvl1024b configurations. See here for a minimal reproducer: llvm#153393 (comment)
1 parent 0659044 commit db5f7dc

File tree

4 files changed

+40
-65
lines changed

4 files changed

+40
-65
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 27 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -10564,41 +10564,35 @@ class InstructionsCompatibilityAnalysis {
1056410564
unsigned MainOpcode = 0;
1056510565
Instruction *MainOp = nullptr;
1056610566

10567-
/// Checks if the opcode is supported as the main opcode for copyable
10568-
/// elements.
10569-
static bool isSupportedOpcode(const unsigned Opcode) {
10570-
return Opcode == Instruction::Add || Opcode == Instruction::LShr;
10571-
}
10572-
1057310567
/// Identifies the best candidate value, which represents main opcode
1057410568
/// operation.
1057510569
/// Currently the best candidate is the Add instruction with the parent
1057610570
/// block with the highest DFS incoming number (block, that dominates other).
1057710571
void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) {
1057810572
BasicBlock *Parent = nullptr;
1057910573
// Checks if the instruction has supported opcode.
10580-
auto IsSupportedInstruction = [&](Instruction *I) {
10581-
return I && isSupportedOpcode(I->getOpcode()) &&
10574+
auto IsSupportedOpcode = [&](Instruction *I) {
10575+
return I && I->getOpcode() == Instruction::Add &&
1058210576
(!doesNotNeedToBeScheduled(I) || !R.isVectorized(I));
1058310577
};
1058410578
// Exclude operands instructions immediately to improve compile time, it
1058510579
// will be unable to schedule anyway.
1058610580
SmallDenseSet<Value *, 8> Operands;
10587-
SmallMapVector<unsigned, SmallVector<Instruction *>, 4> Candidates;
1058810581
for (Value *V : VL) {
1058910582
auto *I = dyn_cast<Instruction>(V);
1059010583
if (!I)
1059110584
continue;
1059210585
if (!DT.isReachableFromEntry(I->getParent()))
1059310586
continue;
10594-
if (Candidates.empty()) {
10595-
Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10587+
if (!MainOp) {
10588+
MainOp = I;
1059610589
Parent = I->getParent();
1059710590
Operands.insert(I->op_begin(), I->op_end());
1059810591
continue;
1059910592
}
1060010593
if (Parent == I->getParent()) {
10601-
Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10594+
if (!IsSupportedOpcode(MainOp) && !Operands.contains(I))
10595+
MainOp = I;
1060210596
Operands.insert(I->op_begin(), I->op_end());
1060310597
continue;
1060410598
}
@@ -10610,35 +10604,24 @@ class InstructionsCompatibilityAnalysis {
1061010604
(NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) &&
1061110605
"Different nodes should have different DFS numbers");
1061210606
if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) {
10613-
Candidates.clear();
10614-
Candidates.try_emplace(I->getOpcode()).first->second.push_back(I);
10607+
MainOp = I;
1061510608
Parent = I->getParent();
1061610609
Operands.clear();
1061710610
Operands.insert(I->op_begin(), I->op_end());
1061810611
}
1061910612
}
10620-
unsigned BestOpcodeNum = 0;
10621-
MainOp = nullptr;
10622-
for (const auto &P : Candidates) {
10623-
if (P.second.size() < BestOpcodeNum)
10624-
continue;
10625-
for (Instruction *I : P.second) {
10626-
if (IsSupportedInstruction(I) && !Operands.contains(I)) {
10627-
MainOp = I;
10628-
BestOpcodeNum = P.second.size();
10629-
break;
10630-
}
10631-
}
10613+
if (!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) {
10614+
MainOp = nullptr;
10615+
return;
1063210616
}
10633-
if (MainOp)
10634-
MainOpcode = MainOp->getOpcode();
10617+
MainOpcode = MainOp->getOpcode();
1063510618
}
1063610619

1063710620
/// Returns the idempotent value for the \p MainOp with the detected \p
1063810621
/// MainOpcode. For Add, returns 0. For Or, it should choose between false and
1063910622
/// the operand itself, since V or V == V.
1064010623
Value *selectBestIdempotentValue() const {
10641-
assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10624+
assert(MainOpcode == Instruction::Add && "Unsupported opcode");
1064210625
return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(),
1064310626
!MainOp->isCommutative());
1064410627
}
@@ -10651,8 +10634,13 @@ class InstructionsCompatibilityAnalysis {
1065110634
return {V, V};
1065210635
if (!S.isCopyableElement(V))
1065310636
return convertTo(cast<Instruction>(V), S).second;
10654-
assert(isSupportedOpcode(MainOpcode) && "Unsupported opcode");
10655-
return {V, selectBestIdempotentValue()};
10637+
switch (MainOpcode) {
10638+
case Instruction::Add:
10639+
return {V, selectBestIdempotentValue()};
10640+
default:
10641+
break;
10642+
}
10643+
llvm_unreachable("Unsupported opcode");
1065610644
}
1065710645

1065810646
/// Builds operands for the original instructions.
@@ -10865,21 +10853,6 @@ class InstructionsCompatibilityAnalysis {
1086510853
}
1086610854
if (!Res)
1086710855
return InstructionsState::invalid();
10868-
constexpr TTI::TargetCostKind Kind = TTI::TCK_RecipThroughput;
10869-
InstructionCost ScalarCost = TTI.getInstructionCost(S.getMainOp(), Kind);
10870-
InstructionCost VectorCost;
10871-
FixedVectorType *VecTy =
10872-
getWidenedType(S.getMainOp()->getType(), VL.size());
10873-
switch (MainOpcode) {
10874-
case Instruction::Add:
10875-
case Instruction::LShr:
10876-
VectorCost = TTI.getArithmeticInstrCost(MainOpcode, VecTy, Kind);
10877-
break;
10878-
default:
10879-
llvm_unreachable("Unexpected instruction.");
10880-
}
10881-
if (VectorCost > ScalarCost)
10882-
return InstructionsState::invalid();
1088310856
return S;
1088410857
}
1088510858
assert(Operands.size() == 2 && "Unexpected number of operands!");
@@ -21117,7 +21090,6 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
2111721090
ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx);
2111821091
const auto *It = find(Op, CD->getInst());
2111921092
assert(It != Op.end() && "Lane not set");
21120-
SmallPtrSet<Instruction *, 4> Visited;
2112121093
do {
2112221094
int Lane = std::distance(Op.begin(), It);
2112321095
assert(Lane >= 0 && "Lane not set");
@@ -21139,15 +21111,13 @@ void BoUpSLP::BlockScheduling::calculateDependencies(
2113921111
(InsertInReadyList && UseSD->isReady()))
2114021112
WorkList.push_back(UseSD);
2114121113
}
21142-
} else if (Visited.insert(In).second) {
21143-
if (ScheduleData *UseSD = getScheduleData(In)) {
21144-
CD->incDependencies();
21145-
if (!UseSD->isScheduled())
21146-
CD->incrementUnscheduledDeps(1);
21147-
if (!UseSD->hasValidDependencies() ||
21148-
(InsertInReadyList && UseSD->isReady()))
21149-
WorkList.push_back(UseSD);
21150-
}
21114+
} else if (ScheduleData *UseSD = getScheduleData(In)) {
21115+
CD->incDependencies();
21116+
if (!UseSD->isScheduled())
21117+
CD->incrementUnscheduledDeps(1);
21118+
if (!UseSD->hasValidDependencies() ||
21119+
(InsertInReadyList && UseSD->isReady()))
21120+
WorkList.push_back(UseSD);
2115121121
}
2115221122
It = find(make_range(std::next(It), Op.end()), CD->getInst());
2115321123
} while (It != Op.end());
@@ -21905,11 +21875,9 @@ bool BoUpSLP::collectValuesToDemote(
2190521875
return all_of(E.Scalars, [&](Value *V) {
2190621876
if (isa<PoisonValue>(V))
2190721877
return true;
21908-
APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
21909-
if (E.isCopyableElement(V))
21910-
return MaskedValueIsZero(V, ShiftedBits, SimplifyQuery(*DL));
2191121878
auto *I = cast<Instruction>(V);
2191221879
KnownBits AmtKnownBits = computeKnownBits(I->getOperand(1), *DL);
21880+
APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
2191321881
return AmtKnownBits.getMaxValue().ult(BitWidth) &&
2191421882
MaskedValueIsZero(I->getOperand(0), ShiftedBits,
2191521883
SimplifyQuery(*DL));

llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,11 @@ define i32 @test(ptr %c) {
88
; CHECK-NEXT: [[BITLEN:%.*]] = getelementptr i8, ptr [[C]], i64 136
99
; CHECK-NEXT: [[INCDEC_PTR_3_1:%.*]] = getelementptr i8, ptr [[C]], i64 115
1010
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[BITLEN]], align 8
11-
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 0, i32 0, i32 0>
12-
; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i64> [[TMP1]], zeroinitializer
11+
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <6 x i32> <i32 1, i32 1, i32 1, i32 1, i32 0, i32 0>
12+
; CHECK-NEXT: [[TMP2:%.*]] = lshr <6 x i64> [[TMP1]], zeroinitializer
13+
; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 1, i32 0, i32 poison, i32 poison>
14+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP2]], <6 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 poison, i32 poison>
15+
; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 4, i32 5>
1316
; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i8>
1417
; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[INCDEC_PTR_3_1]], align 1
1518
; CHECK-NEXT: ret i32 0

llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,10 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
101101
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
102102
; CHECK-LABEL: @PR16739_byval(
103103
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
104-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
105-
; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
104+
; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
105+
; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
106+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
107+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
106108
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
107109
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
108110
; CHECK-NEXT: ret <4 x float> [[TMP5]]

llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,10 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16)
101101
define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) {
102102
; CHECK-LABEL: @PR16739_byval(
103103
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16
104-
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
105-
; CHECK-NEXT: [[TMP3:%.*]] = lshr <4 x i64> [[TMP2]], <i64 0, i64 32, i64 0, i64 0>
104+
; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16
105+
; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32
106+
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 1>
107+
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1
106108
; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32>
107109
; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float>
108110
; CHECK-NEXT: ret <4 x float> [[TMP5]]

0 commit comments

Comments
 (0)