Skip to content

Commit 7a9ad25

Browse files
committed
Recommit "[SLP][X86] Improve reordering to consider alternate instruction bundles"
This reverts commit 6d6268d. Review: https://reviews.llvm.org/D125712
1 parent ce07b95 commit 7a9ad25

File tree

7 files changed

+110
-11
lines changed

7 files changed

+110
-11
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#ifndef LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
2222
#define LLVM_ANALYSIS_TARGETTRANSFORMINFO_H
2323

24+
#include "llvm/ADT/SmallBitVector.h"
2425
#include "llvm/IR/FMF.h"
2526
#include "llvm/IR/InstrTypes.h"
2627
#include "llvm/IR/PassManager.h"
@@ -678,6 +679,16 @@ class TargetTransformInfo {
678679
/// Return true if the target supports masked expand load.
679680
bool isLegalMaskedExpandLoad(Type *DataType) const;
680681

682+
/// Return true if this is an alternating opcode pattern that can be lowered
683+
/// to a single instruction on the target. In X86 this is for the addsub
684+
/// instruction which corresponds to a Shuffle + Fadd + FSub pattern in IR.
685+
/// This function expects two opcodes: \p Opcode0 and \p Opcode1 being
686+
/// selected by \p OpcodeMask. The mask contains one bit per lane and is a `0`
687+
/// when \p Opcode0 is selected and `1` when Opcode1 is selected.
688+
/// \p VecTy is the vector type of the instruction to be generated.
689+
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
690+
const SmallBitVector &OpcodeMask) const;
691+
681692
/// Return true if we should be enabling ordered reductions for the target.
682693
bool enableOrderedReductions() const;
683694

@@ -1581,6 +1592,9 @@ class TargetTransformInfo::Concept {
15811592
Align Alignment) = 0;
15821593
virtual bool isLegalMaskedCompressStore(Type *DataType) = 0;
15831594
virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0;
1595+
virtual bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
1596+
unsigned Opcode1,
1597+
const SmallBitVector &OpcodeMask) const = 0;
15841598
virtual bool enableOrderedReductions() = 0;
15851599
virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0;
15861600
virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0;
@@ -2006,6 +2020,10 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
20062020
bool isLegalMaskedExpandLoad(Type *DataType) override {
20072021
return Impl.isLegalMaskedExpandLoad(DataType);
20082022
}
2023+
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
2024+
const SmallBitVector &OpcodeMask) const override {
2025+
return Impl.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask);
2026+
}
20092027
bool enableOrderedReductions() override {
20102028
return Impl.enableOrderedReductions();
20112029
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,11 @@ class TargetTransformInfoImplBase {
279279

280280
bool isLegalMaskedCompressStore(Type *DataType) const { return false; }
281281

282+
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
283+
const SmallBitVector &OpcodeMask) const {
284+
return false;
285+
}
286+
282287
bool isLegalMaskedExpandLoad(Type *DataType) const { return false; }
283288

284289
bool enableOrderedReductions() const { return false; }

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,12 @@ bool TargetTransformInfo::isLegalMaskedGather(Type *DataType,
412412
return TTIImpl->isLegalMaskedGather(DataType, Alignment);
413413
}
414414

415+
bool TargetTransformInfo::isLegalAltInstr(
416+
VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
417+
const SmallBitVector &OpcodeMask) const {
418+
return TTIImpl->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask);
419+
}
420+
415421
bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType,
416422
Align Alignment) const {
417423
return TTIImpl->isLegalMaskedScatter(DataType, Alignment);

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5341,6 +5341,39 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
53415341
return IntWidth == 32 || IntWidth == 64;
53425342
}
53435343

5344+
bool X86TTIImpl::isLegalAltInstr(VectorType *VecTy, unsigned Opcode0,
5345+
unsigned Opcode1,
5346+
const SmallBitVector &OpcodeMask) const {
5347+
// ADDSUBPS 4xf32 SSE3
5348+
// VADDSUBPS 4xf32 AVX
5349+
// VADDSUBPS 8xf32 AVX2
5350+
// ADDSUBPD 2xf64 SSE3
5351+
// VADDSUBPD 2xf64 AVX
5352+
// VADDSUBPD 4xf64 AVX2
5353+
5354+
unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
5355+
assert(OpcodeMask.size() == NumElements && "Mask and VecTy are incompatible");
5356+
if (!isPowerOf2_32(NumElements))
5357+
return false;
5358+
// Check the opcode pattern. We apply the mask on the opcode arguments and
5359+
// then check if it is what we expect.
5360+
for (int Lane : seq<int>(0, NumElements)) {
5361+
unsigned Opc = OpcodeMask.test(Lane) ? Opcode1 : Opcode0;
5362+
// We expect FSub for even lanes and FAdd for odd lanes.
5363+
if (Lane % 2 == 0 && Opc != Instruction::FSub)
5364+
return false;
5365+
if (Lane % 2 == 1 && Opc != Instruction::FAdd)
5366+
return false;
5367+
}
5368+
// Now check that the pattern is supported by the target ISA.
5369+
Type *ElemTy = cast<VectorType>(VecTy)->getElementType();
5370+
if (ElemTy->isFloatTy())
5371+
return ST->hasSSE3() && NumElements % 4 == 0;
5372+
if (ElemTy->isDoubleTy())
5373+
return ST->hasSSE3() && NumElements % 2 == 0;
5374+
return false;
5375+
}
5376+
53445377
bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
53455378
// AVX2 doesn't support scatter
53465379
if (!ST->hasAVX512())

llvm/lib/Target/X86/X86TargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,8 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
241241
bool isLegalMaskedScatter(Type *DataType, Align Alignment);
242242
bool isLegalMaskedExpandLoad(Type *DataType);
243243
bool isLegalMaskedCompressStore(Type *DataType);
244+
bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1,
245+
const SmallBitVector &OpcodeMask) const;
244246
bool hasDivRemOp(Type *DataType, bool IsSigned);
245247
bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
246248
bool areInlineCompatible(const Function *Caller,

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3710,14 +3710,21 @@ void BoUpSLP::reorderTopToBottom() {
37103710
// their ordering.
37113711
DenseMap<const TreeEntry *, OrdersType> GathersToOrders;
37123712

3713+
// AltShuffles can also have a preferred ordering that leads to fewer
3714+
// instructions, e.g., the addsub instruction in x86.
3715+
DenseMap<const TreeEntry *, OrdersType> AltShufflesToOrders;
3716+
37133717
// Maps a TreeEntry to the reorder indices of external users.
37143718
DenseMap<const TreeEntry *, SmallVector<OrdersType, 1>>
37153719
ExternalUserReorderMap;
3720+
// FIXME: Workaround for syntax error reported by MSVC buildbots.
3721+
TargetTransformInfo &TTIRef = *TTI;
37163722
// Find all reorderable nodes with the given VF.
37173723
// Currently these are vectorized stores, loads, extracts + some gathering of
37183724
// extracts.
3719-
for_each(VectorizableTree, [this, &VFToOrderedEntries, &GathersToOrders,
3720-
&ExternalUserReorderMap](
3725+
for_each(VectorizableTree, [this, &TTIRef, &VFToOrderedEntries,
3726+
&GathersToOrders, &ExternalUserReorderMap,
3727+
&AltShufflesToOrders](
37213728
const std::unique_ptr<TreeEntry> &TE) {
37223729
// Look for external users that will probably be vectorized.
37233730
SmallVector<OrdersType, 1> ExternalUserReorderIndices =
@@ -3728,6 +3735,27 @@ void BoUpSLP::reorderTopToBottom() {
37283735
std::move(ExternalUserReorderIndices));
37293736
}
37303737

3738+
// Patterns like [fadd,fsub] can be combined into a single instruction in
3739+
// x86. Reordering them into [fsub,fadd] blocks this pattern. So we need
3740+
// to take into account their order when looking for the most used order.
3741+
if (TE->isAltShuffle()) {
3742+
VectorType *VecTy =
3743+
FixedVectorType::get(TE->Scalars[0]->getType(), TE->Scalars.size());
3744+
unsigned Opcode0 = TE->getOpcode();
3745+
unsigned Opcode1 = TE->getAltOpcode();
3746+
// The opcode mask selects between the two opcodes.
3747+
SmallBitVector OpcodeMask(TE->Scalars.size(), 0);
3748+
for (unsigned Lane : seq<unsigned>(0, TE->Scalars.size()))
3749+
if (cast<Instruction>(TE->Scalars[Lane])->getOpcode() == Opcode1)
3750+
OpcodeMask.set(Lane);
3751+
// If this pattern is supported by the target then we consider the order.
3752+
if (TTIRef.isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) {
3753+
VFToOrderedEntries[TE->Scalars.size()].insert(TE.get());
3754+
AltShufflesToOrders.try_emplace(TE.get(), OrdersType());
3755+
}
3756+
// TODO: Check the reverse order too.
3757+
}
3758+
37313759
if (Optional<OrdersType> CurrentOrder =
37323760
getReorderingData(*TE, /*TopToBottom=*/true)) {
37333761
// Do not include ordering for nodes used in the alt opcode vectorization,
@@ -3778,12 +3806,18 @@ void BoUpSLP::reorderTopToBottom() {
37783806
if (!OpTE->ReuseShuffleIndices.empty())
37793807
continue;
37803808
// Count number of orders uses.
3781-
const auto &Order = [OpTE, &GathersToOrders]() -> const OrdersType & {
3809+
const auto &Order = [OpTE, &GathersToOrders,
3810+
&AltShufflesToOrders]() -> const OrdersType & {
37823811
if (OpTE->State == TreeEntry::NeedToGather) {
37833812
auto It = GathersToOrders.find(OpTE);
37843813
if (It != GathersToOrders.end())
37853814
return It->second;
37863815
}
3816+
if (OpTE->isAltShuffle()) {
3817+
auto It = AltShufflesToOrders.find(OpTE);
3818+
if (It != AltShufflesToOrders.end())
3819+
return It->second;
3820+
}
37873821
return OpTE->ReorderIndices;
37883822
}();
37893823
// First consider the order of the external scalar users.

llvm/test/Transforms/SLPVectorizer/X86/reorder_with_external_users.ll

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -118,20 +118,21 @@ define void @addsub_and_external_users(double *%A, double *%ptr) {
118118
; CHECK-NEXT: [[LD:%.*]] = load double, double* undef, align 8
119119
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[LD]], i32 0
120120
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[LD]], i32 1
121-
; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], <double 1.200000e+00, double 1.100000e+00>
122-
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], <double 1.200000e+00, double 1.100000e+00>
123-
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> <i32 2, i32 1>
124-
; CHECK-NEXT: [[TMP5:%.*]] = fdiv <2 x double> [[TMP4]], <double 2.200000e+00, double 2.100000e+00>
125-
; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], <double 3.200000e+00, double 3.100000e+00>
121+
; CHECK-NEXT: [[TMP2:%.*]] = fsub <2 x double> [[TMP1]], <double 1.100000e+00, double 1.200000e+00>
122+
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], <double 1.100000e+00, double 1.200000e+00>
123+
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP3]], <2 x i32> <i32 0, i32 3>
124+
; CHECK-NEXT: [[TMP5:%.*]] = fdiv <2 x double> [[TMP4]], <double 2.100000e+00, double 2.200000e+00>
125+
; CHECK-NEXT: [[TMP6:%.*]] = fmul <2 x double> [[TMP5]], <double 3.100000e+00, double 3.200000e+00>
126126
; CHECK-NEXT: [[PTRA0:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 0
127+
; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <2 x i32> <i32 1, i32 0>
127128
; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[PTRA0]] to <2 x double>*
128-
; CHECK-NEXT: store <2 x double> [[TMP6]], <2 x double>* [[TMP7]], align 8
129+
; CHECK-NEXT: store <2 x double> [[SHUFFLE]], <2 x double>* [[TMP7]], align 8
129130
; CHECK-NEXT: br label [[BB2:%.*]]
130131
; CHECK: bb2:
131-
; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP6]], <double 4.200000e+00, double 4.100000e+00>
132+
; CHECK-NEXT: [[TMP8:%.*]] = fadd <2 x double> [[TMP6]], <double 4.100000e+00, double 4.200000e+00>
132133
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x double> [[TMP8]], i32 0
133134
; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP8]], i32 1
134-
; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP10]], [[TMP9]]
135+
; CHECK-NEXT: [[SEED:%.*]] = fcmp ogt double [[TMP9]], [[TMP10]]
135136
; CHECK-NEXT: ret void
136137
;
137138
bb1:

0 commit comments

Comments
 (0)