Commit 74230ff
[SLP]Improved/fixed FMAD support in reductions

In the initial FMAD patch, potential FMAD nodes were excluded from the
reduction analysis entirely to keep that patch small, but this may cause
regressions. This patch adds better detection of scalar FMAD reduction
operations and tries to calculate the cost of FMAD reduction operations
correctly (excluding the cost of the scalar fmuls), as well as the cost of
split reduction operations combined with regular FMADs.

Reviewers: RKSimon, gregbedwell, hiraditya

Reviewed By: RKSimon

Pull Request: #152787
1 parent fdfc751 commit 74230ff
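
For context, the pattern at issue looks like the sketch below (an
illustrative example written for this page, not taken from the patch or its
tests): every reduced value is a single-use fmul whose only consumer is the
fadd reduction chain, so a target with cheap FMA can fold each fmul/fadd
pair into one fused multiply-add.

; Scalar fadd reduction over single-use fmuls: the shape canConvertToFMA
; is asked about when costing the reduction.
define double @dot3(double %a0, double %b0, double %a1, double %b1,
                    double %a2, double %b2) {
  %m0 = fmul fast double %a0, %b0
  %m1 = fmul fast double %a1, %b1
  %m2 = fmul fast double %a2, %b2
  %add0 = fadd fast double %m0, %m1
  %add1 = fadd fast double %add0, %m2
  ret double %add1
}

Vectorizing this into a wide fmul feeding @llvm.vector.reduce.fadd forfeits
the scalar FMAs, so pricing the scalar side as separate fmul and fadd
instructions overstates the benefit of vectorizing; that is the regression
the cost changes below address.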

File tree: 4 files changed, +140 -72 lines changed


llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 61 additions & 7 deletions
@@ -23842,7 +23842,8 @@ class HorizontalReduction {
 
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
-                     const TargetLibraryInfo &TLI, AssumptionCache *AC) {
+                     const TargetLibraryInfo &TLI, AssumptionCache *AC,
+                     DominatorTree &DT) {
     constexpr unsigned RegMaxNumber = 4;
     constexpr unsigned RedValsMaxNumber = 128;
     // If there are a sufficient number of reduction values, reduce
@@ -24241,7 +24242,7 @@
 
     // Estimate cost.
     InstructionCost ReductionCost =
-        getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
+        getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
     InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
     LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                       << " for reduction\n");
@@ -24546,7 +24547,9 @@
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                    ArrayRef<Value *> ReducedVals,
                                    bool IsCmpSelMinMax, FastMathFlags FMF,
-                                   const BoUpSLP &R) {
+                                   const BoUpSLP &R, DominatorTree &DT,
+                                   const DataLayout &DL,
+                                   const TargetLibraryInfo &TLI) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Type *ScalarTy = ReducedVals.front()->getType();
     unsigned ReduxWidth = ReducedVals.size();
@@ -24571,6 +24574,22 @@
       for (User *U : RdxVal->users()) {
         auto *RdxOp = cast<Instruction>(U);
         if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+          if (RdxKind == RecurKind::FAdd) {
+            InstructionCost FMACost = canConvertToFMA(
+                RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
+            if (FMACost.isValid()) {
+              LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
+              if (auto *I = dyn_cast<Instruction>(RdxVal)) {
+                // Also, exclude scalar fmul cost.
+                InstructionCost FMulCost =
+                    TTI->getInstructionCost(I, CostKind);
+                LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
+                FMACost -= FMulCost;
+              }
+              ScalarCost += FMACost;
+              continue;
+            }
+          }
           ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
           continue;
         }
@@ -24635,8 +24654,43 @@
         auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
             std::make_pair(RedTy, true));
         VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
-        VectorCost +=
-            TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
+        InstructionCost FMACost = InstructionCost::getInvalid();
+        if (RdxKind == RecurKind::FAdd) {
+          // Check if the reduction operands can be converted to FMA.
+          SmallVector<Value *> Ops;
+          FastMathFlags FMF;
+          FMF.set();
+          for (Value *RdxVal : ReducedVals) {
+            if (!RdxVal->hasOneUse()) {
+              Ops.clear();
+              break;
+            }
+            if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
+              FMF &= FPCI->getFastMathFlags();
+            Ops.push_back(RdxVal->user_back());
+          }
+          FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
+                                    *TTI, TLI);
+          if (FMACost.isValid()) {
+            // Calculate actual FMAD cost.
+            IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
+                                        {RVecTy, RVecTy, RVecTy}, FMF);
+            FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
+
+            LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
+            // Also, exclude vector fmul cost.
+            InstructionCost FMulCost = TTI->getArithmeticInstrCost(
+                Instruction::FMul, RVecTy, CostKind);
+            LLVM_DEBUG(dbgs()
+                       << "Minus vector FMul cost: " << FMulCost << "\n");
+            FMACost -= FMulCost;
+          }
+        }
+        if (FMACost.isValid())
+          VectorCost += FMACost;
+        else
+          VectorCost +=
+              TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
         if (RType != RedTy) {
           unsigned Opcode = Instruction::Trunc;
           if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
@@ -25304,7 +25358,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
     HorizontalReduction HorRdx;
     if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
       return nullptr;
-    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
+    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
   };
   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -25449,7 +25503,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
     if (RedCost >= ScalarCost)
       return false;
 
-    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
   };
   if (Candidates.size() == 1)
     return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
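
A note on the cost accounting above (a reading of the patch, not text from
it): the fmuls are already paid for on both sides of the comparison, since
they are the reduced values of the tree, so only the increment for folding
the fadd into an FMA is charged. Per scalar reduction op that is
canConvertToFMA's cost minus the scalar fmul cost; per vector reduction op
it is the cost of an llvm.fmuladd intrinsic minus a vector fmul. A schematic
of the vector shape being priced (illustrative only; the function name is
mine):

; When the fmul/fadd pair is FMA-convertible, the reduction op is priced
; as fmuladd(<4 x double>) minus the vector fmul, since the fmul's cost
; is already counted in the vectorized tree.
define double @dot_vec(<4 x double> %x, <4 x double> %y) {
  %m = fmul fast <4 x double> %x, %y
  %r = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> %m)
  ret double %r
}
declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)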

llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll

Lines changed: 26 additions & 43 deletions
@@ -709,34 +709,25 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
 
 
 define double @dot_product_fp64(ptr %a, ptr %b) {
-; NON-POW2-LABEL: @dot_product_fp64(
-; NON-POW2-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
-; NON-POW2-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
-; NON-POW2-NEXT:    [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4
-; NON-POW2-NEXT:    [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4
-; NON-POW2-NEXT:    [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]]
-; NON-POW2-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
-; NON-POW2-NEXT:    ret double [[TMP4]]
-;
-; POW2-ONLY-LABEL: @dot_product_fp64(
-; POW2-ONLY-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT:    [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
-; POW2-ONLY-NEXT:    [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
-; POW2-ONLY-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
-; POW2-ONLY-NEXT:    [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
-; POW2-ONLY-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT:    [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
-; POW2-ONLY-NEXT:    [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
-; POW2-ONLY-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
-; POW2-ONLY-NEXT:    [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
-; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
-; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
-; POW2-ONLY-NEXT:    ret double [[ADD_1]]
+; CHECK-LABEL: @dot_product_fp64(
+; CHECK-NEXT:    [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
+; CHECK-NEXT:    [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
+; CHECK-NEXT:    [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
+; CHECK-NEXT:    [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
+; CHECK-NEXT:    [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
+; CHECK-NEXT:    [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
+; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
+; CHECK-NEXT:    [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
+; CHECK-NEXT:    [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
+; CHECK-NEXT:    [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
+; CHECK-NEXT:    ret double [[ADD_1]]
 ;
 %gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
 %l.a.0 = load double, ptr %gep.a.0, align 4
@@ -793,21 +784,13 @@ entry:
 }
 
 define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) {
-; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec(
-; NON-POW2-NEXT:    [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0
-; NON-POW2-NEXT:    [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1
-; NON-POW2-NEXT:    [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2
-; NON-POW2-NEXT:    [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01)
-; NON-POW2-NEXT:    [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]])
-; NON-POW2-NEXT:    ret float [[TMP5]]
-;
-; POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec(
-; POW2-ONLY-NEXT:    [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
-; POW2-ONLY-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
-; POW2-ONLY-NEXT:    [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
-; POW2-ONLY-NEXT:    [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
-; POW2-ONLY-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
-; POW2-ONLY-NEXT:    ret float [[ADD_1]]
+; CHECK-LABEL: @reduce_fadd_after_fmul_of_buildvec(
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
+; CHECK-NEXT:    [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
+; CHECK-NEXT:    ret float [[ADD_1]]
 ;
 %mul.0 = fmul fast float %a, 10.0
 %mul.1 = fmul fast float %b, 10.0

llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll

Lines changed: 17 additions & 12 deletions
@@ -10,19 +10,24 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg
 define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15>
+; CHECK-NEXT:    [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
 ; CHECK-NEXT:    [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x double> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8
-; CHECK-NEXT:    [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP7]])
-; CHECK-NEXT:    [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP5]])
-; CHECK-NEXT:    [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
-; CHECK-NEXT:    [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, ptr [[GEP2_0]], align 8
+; CHECK-NEXT:    [[GEP2_4:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 20
+; CHECK-NEXT:    [[TMP1:%.*]] = call <15 x double> @llvm.masked.load.v15f64.p0(ptr [[GEP1_0]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x double> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP3:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <8 x double> [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP4]])
+; CHECK-NEXT:    [[TMP8:%.*]] = load <4 x double>, ptr [[GEP2_4]], align 8
+; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast <4 x double> [[TMP8]], [[TMP9]]
+; CHECK-NEXT:    [[RDX_OP:%.*]] = fadd fast <4 x double> [[TMP6]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[RDX_OP]])
+; CHECK-NEXT:    [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
+; CHECK-NEXT:    [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP11]], i64 1
 ; CHECK-NEXT:    [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true))
 ; CHECK-NEXT:    ret void

llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll

Lines changed: 36 additions & 10 deletions
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=SSE4
 ; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX
-; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX
+; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX2
 
 ; This test checks for a case when a horizontal reduction of floating-point
 ; adds may look profitable, but is not because it eliminates generation of
@@ -26,13 +26,27 @@ define void @hr() {
 ; AVX:       loop:
 ; AVX-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ]
 ; AVX-NEXT:    [[CVT0:%.*]] = uitofp i16 0 to double
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
-; AVX-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
-; AVX-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
-; AVX-NEXT:    [[ADD3]] = fadd fast double [[TMP3]], [[PHI0]]
+; AVX-NEXT:    [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]]
+; AVX-NEXT:    [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]]
+; AVX-NEXT:    [[ADD1:%.*]] = fadd fast double 0.000000e+00, [[ADD0]]
+; AVX-NEXT:    [[ADD2:%.*]] = fadd fast double 0.000000e+00, [[ADD1]]
+; AVX-NEXT:    [[ADD3]] = fadd fast double 0.000000e+00, [[ADD2]]
 ; AVX-NEXT:    br i1 true, label [[EXIT:%.*]], label [[LOOP]]
 ; AVX:       exit:
 ; AVX-NEXT:    ret void
+;
+; AVX2-LABEL: @hr(
+; AVX2-NEXT:    br label [[LOOP:%.*]]
+; AVX2:       loop:
+; AVX2-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ]
+; AVX2-NEXT:    [[CVT0:%.*]] = uitofp i16 0 to double
+; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
+; AVX2-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
+; AVX2-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
+; AVX2-NEXT:    [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]]
+; AVX2-NEXT:    br i1 true, label [[EXIT:%.*]], label [[LOOP]]
+; AVX2:       exit:
+; AVX2-NEXT:    ret void
 ;
 br label %loop
 
@@ -70,12 +84,24 @@ define double @hr_or_mul() {
 ;
 ; AVX-LABEL: @hr_or_mul(
 ; AVX-NEXT:    [[CVT0:%.*]] = uitofp i16 3 to double
-; AVX-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
-; AVX-NEXT:    [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]]
-; AVX-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
+; AVX-NEXT:    [[TMP4:%.*]] = fmul fast double 7.000000e+00, [[CVT0]]
 ; AVX-NEXT:    [[ADD3:%.*]] = fadd fast double [[TMP4]], [[CVT0]]
-; AVX-NEXT:    ret double [[ADD3]]
+; AVX-NEXT:    [[MUL1:%.*]] = fmul fast double -4.300000e+01, [[CVT0]]
+; AVX-NEXT:    [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD3]]
+; AVX-NEXT:    [[MUL2:%.*]] = fmul fast double 2.200000e-02, [[CVT0]]
+; AVX-NEXT:    [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]]
+; AVX-NEXT:    [[MUL3:%.*]] = fmul fast double 9.500000e+00, [[CVT0]]
+; AVX-NEXT:    [[ADD4:%.*]] = fadd fast double [[MUL3]], [[ADD2]]
+; AVX-NEXT:    ret double [[ADD4]]
+;
+; AVX2-LABEL: @hr_or_mul(
+; AVX2-NEXT:    [[CVT0:%.*]] = uitofp i16 3 to double
+; AVX2-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
+; AVX2-NEXT:    [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]]
+; AVX2-NEXT:    [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
+; AVX2-NEXT:    [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]]
+; AVX2-NEXT:    ret double [[OP_RDX]]
 ;
 %cvt0 = uitofp i16 3 to double
 %mul0 = fmul fast double 7.000000e+00, %cvt0
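
For experimenting with where the threshold now falls, a standalone file in
the same style as the tests above can be fed to opt directly (hypothetical
file and function name; the RUN line reuses the flags from slp-fma-loss.ll):

; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 \
; RUN:   -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s
; Whether this stays scalar (leaving the fmul/fadd pairs for the backend to
; fuse into FMAs) or becomes a vector fmul feeding an fadd reduction depends
; on the target's FMA cost versus its reduction cost.
define float @fma_vs_reduction(float %a0, float %b0, float %a1, float %b1,
                               float %acc) {
; CHECK-LABEL: @fma_vs_reduction(
  %m0 = fmul fast float %a0, %b0
  %m1 = fmul fast float %a1, %b1
  %r0 = fadd fast float %acc, %m0
  %r1 = fadd fast float %r0, %m1
  ret float %r1
}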
