
Commit 6d902b6

Revert "[SLP]Improved/fixed FMAD support in reductions"
This reverts commit 74230ff to fix the bugs found during local testing.
1 parent 314dc33 commit 6d902b6
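
For context, the reverted patch taught the SLP reduction cost model to price fadd reductions whose operands are single-use fmuls as potential FMAs (via canConvertToFMA and Intrinsic::fmuladd). A minimal sketch of the scalar shape at issue, modeled on the dot-product tests below; the function and value names are ours, not from the commit:

  define double @dot2(double %acc, double %a0, double %b0, double %a1, double %b1) {
    ; Two single-use fmuls feeding a fadd reduction chain. With fast-math,
    ; a backend can contract each fmul+fadd pair into one fused multiply-add.
    %m0 = fmul fast double %a0, %b0
    %m1 = fmul fast double %a1, %b1
    %s0 = fadd fast double %acc, %m0
    %s1 = fadd fast double %s0, %m1
    ret double %s1
  }

Vectorizing such a fadd chain into a single horizontal reduction can lose those FMA opportunities, which is what the removed cost code below tried to account for.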

4 files changed: +72 additions, −140 deletions

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 7 additions & 61 deletions
@@ -23842,8 +23842,7 @@ class HorizontalReduction {
 
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
-                     const TargetLibraryInfo &TLI, AssumptionCache *AC,
-                     DominatorTree &DT) {
+                     const TargetLibraryInfo &TLI, AssumptionCache *AC) {
     constexpr unsigned RegMaxNumber = 4;
     constexpr unsigned RedValsMaxNumber = 128;
     // If there are a sufficient number of reduction values, reduce
@@ -24242,7 +24241,7 @@ class HorizontalReduction {
 
     // Estimate cost.
     InstructionCost ReductionCost =
-        getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
+        getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
     InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
     LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                       << " for reduction\n");
@@ -24547,9 +24546,7 @@ class HorizontalReduction {
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                    ArrayRef<Value *> ReducedVals,
                                    bool IsCmpSelMinMax, FastMathFlags FMF,
-                                   const BoUpSLP &R, DominatorTree &DT,
-                                   const DataLayout &DL,
-                                   const TargetLibraryInfo &TLI) {
+                                   const BoUpSLP &R) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Type *ScalarTy = ReducedVals.front()->getType();
     unsigned ReduxWidth = ReducedVals.size();
@@ -24574,22 +24571,6 @@ class HorizontalReduction {
       for (User *U : RdxVal->users()) {
         auto *RdxOp = cast<Instruction>(U);
         if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
-          if (RdxKind == RecurKind::FAdd) {
-            InstructionCost FMACost = canConvertToFMA(
-                RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
-            if (FMACost.isValid()) {
-              LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
-              if (auto *I = dyn_cast<Instruction>(RdxVal)) {
-                // Also, exclude scalar fmul cost.
-                InstructionCost FMulCost =
-                    TTI->getInstructionCost(I, CostKind);
-                LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
-                FMACost -= FMulCost;
-              }
-              ScalarCost += FMACost;
-              continue;
-            }
-          }
           ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
           continue;
         }
@@ -24654,43 +24635,8 @@ class HorizontalReduction {
       auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
           std::make_pair(RedTy, true));
       VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
-      InstructionCost FMACost = InstructionCost::getInvalid();
-      if (RdxKind == RecurKind::FAdd) {
-        // Check if the reduction operands can be converted to FMA.
-        SmallVector<Value *> Ops;
-        FastMathFlags FMF;
-        FMF.set();
-        for (Value *RdxVal : ReducedVals) {
-          if (!RdxVal->hasOneUse()) {
-            Ops.clear();
-            break;
-          }
-          if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
-            FMF &= FPCI->getFastMathFlags();
-          Ops.push_back(RdxVal->user_back());
-        }
-        FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
-                                  *TTI, TLI);
-        if (FMACost.isValid()) {
-          // Calculate actual FMAD cost.
-          IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
-                                      {RVecTy, RVecTy, RVecTy}, FMF);
-          FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
-
-          LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
-          // Also, exclude vector fmul cost.
-          InstructionCost FMulCost = TTI->getArithmeticInstrCost(
-              Instruction::FMul, RVecTy, CostKind);
-          LLVM_DEBUG(dbgs()
-                     << "Minus vector FMul cost: " << FMulCost << "\n");
-          FMACost -= FMulCost;
-        }
-      }
-      if (FMACost.isValid())
-        VectorCost += FMACost;
-      else
-        VectorCost +=
-            TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
+      VectorCost +=
+          TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
       if (RType != RedTy) {
         unsigned Opcode = Instruction::Trunc;
         if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
@@ -25358,7 +25304,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
     HorizontalReduction HorRdx;
     if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
       return nullptr;
-    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
+    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
   };
   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -25503,7 +25449,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
     if (RedCost >= ScalarCost)
      return false;
 
-    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
+    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
   };
   if (Candidates.size() == 1)
     return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
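
The deleted block above weighed that scalar FMA view against the vectorized alternative: a vector fmuladd intrinsic cost (IntrinsicCostAttributes on Intrinsic::fmuladd) minus the vector fmul it would replace. The vectorized shape being priced is, roughly, the reduce.fadd pattern that recurs in the tests below; a sketch with our own names, not code from this commit:

  declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)

  define double @vec_dot4(<4 x double> %a, <4 x double> %b) {
    ; One wide fmul feeding a horizontal fadd reduction; the per-lane
    ; fmul+fadd pairing is gone, so no scalar FMAs can be formed here.
    %m = fmul fast <4 x double> %a, %b
    %r = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> %m)
    ret double %r
  }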

llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll

Lines changed: 43 additions & 26 deletions
@@ -709,25 +709,34 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
 
 
 define double @dot_product_fp64(ptr %a, ptr %b) {
-; CHECK-LABEL: @dot_product_fp64(
-; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
-; CHECK-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
-; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
-; CHECK-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
-; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
-; CHECK-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
-; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
-; CHECK-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
-; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
-; CHECK-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
-; CHECK-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
-; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
-; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
-; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
-; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT: ret double [[ADD_1]]
+; NON-POW2-LABEL: @dot_product_fp64(
+; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
+; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4
+; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4
+; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]]
+; NON-POW2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
+; NON-POW2-NEXT: ret double [[TMP4]]
+;
+; POW2-ONLY-LABEL: @dot_product_fp64(
+; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
+; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
+; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
+; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
+; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
+; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
+; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
+; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
+; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
+; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
+; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
+; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
+; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
+; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
+; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
+; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT: ret double [[ADD_1]]
 ;
 %gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
 %l.a.0 = load double, ptr %gep.a.0, align 4
@@ -784,13 +793,21 @@ entry:
 }
 
 define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) {
-; CHECK-LABEL: @reduce_fadd_after_fmul_of_buildvec(
-; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
-; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
-; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
-; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
-; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
-; CHECK-NEXT: ret float [[ADD_1]]
+; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec(
+; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0
+; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1
+; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2
+; NON-POW2-NEXT: [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01)
+; NON-POW2-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]])
+; NON-POW2-NEXT: ret float [[TMP5]]
+;
+; POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec(
+; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
+; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
+; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
+; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
+; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
+; POW2-ONLY-NEXT: ret float [[ADD_1]]
 ;
 %mul.0 = fmul fast float %a, 10.0
 %mul.1 = fmul fast float %b, 10.0

llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll

Lines changed: 12 additions & 17 deletions
@@ -10,24 +10,19 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg
 define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15>
 ; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
-; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP2_0]], align 8
-; CHECK-NEXT: [[GEP2_4:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 20
-; CHECK-NEXT: [[TMP1:%.*]] = call <15 x double> @llvm.masked.load.v15f64.p0(ptr [[GEP1_0]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x double> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT: [[TMP3:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <8 x double> [[TMP3]], [[TMP2]]
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP5]]
-; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP4]])
-; CHECK-NEXT: [[TMP8:%.*]] = load <4 x double>, ptr [[GEP2_4]], align 8
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
-; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP8]], [[TMP9]]
-; CHECK-NEXT: [[RDX_OP:%.*]] = fadd fast <4 x double> [[TMP6]], [[TMP10]]
-; CHECK-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[RDX_OP]])
-; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
-; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP11]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x double> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]]
+; CHECK-NEXT: [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]]
+; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP7]])
+; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP5]])
+; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1
 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>
 ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true))
 ; CHECK-NEXT: ret void

llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll

Lines changed: 10 additions & 36 deletions
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -passes=slp-vectorizer -S -mcpu=corei7 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=SSE4
 ; RUN: opt -passes=slp-vectorizer -S -mcpu=bdver2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX
-; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX2
+; RUN: opt -passes=slp-vectorizer -S -mcpu=core-avx2 -mtriple=x86_64-unknown-linux-gnu -slp-threshold=-2 < %s | FileCheck %s --check-prefixes=AVX
 
 ; This test checks for a case when a horizontal reduction of floating-point
 ; adds may look profitable, but is not because it eliminates generation of
@@ -26,27 +26,13 @@ define void @hr() {
 ; AVX: loop:
 ; AVX-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ]
 ; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double
-; AVX-NEXT: [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]]
-; AVX-NEXT: [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]]
-; AVX-NEXT: [[ADD1:%.*]] = fadd fast double 0.000000e+00, [[ADD0]]
-; AVX-NEXT: [[ADD2:%.*]] = fadd fast double 0.000000e+00, [[ADD1]]
-; AVX-NEXT: [[ADD3]] = fadd fast double 0.000000e+00, [[ADD2]]
+; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
+; AVX-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
+; AVX-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
+; AVX-NEXT: [[ADD3]] = fadd fast double [[TMP3]], [[PHI0]]
 ; AVX-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]]
 ; AVX: exit:
 ; AVX-NEXT: ret void
-;
-; AVX2-LABEL: @hr(
-; AVX2-NEXT: br label [[LOOP:%.*]]
-; AVX2: loop:
-; AVX2-NEXT: [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ]
-; AVX2-NEXT: [[CVT0:%.*]] = uitofp i16 0 to double
-; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
-; AVX2-NEXT: [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
-; AVX2-NEXT: [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP2]])
-; AVX2-NEXT: [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]]
-; AVX2-NEXT: br i1 true, label [[EXIT:%.*]], label [[LOOP]]
-; AVX2: exit:
-; AVX2-NEXT: ret void
 ;
 br label %loop
 
@@ -84,24 +70,12 @@ define double @hr_or_mul() {
 ;
 ; AVX-LABEL: @hr_or_mul(
 ; AVX-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double
-; AVX-NEXT: [[TMP4:%.*]] = fmul fast double 7.000000e+00, [[CVT0]]
+; AVX-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
+; AVX-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]]
+; AVX-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
 ; AVX-NEXT: [[ADD3:%.*]] = fadd fast double [[TMP4]], [[CVT0]]
-; AVX-NEXT: [[MUL1:%.*]] = fmul fast double -4.300000e+01, [[CVT0]]
-; AVX-NEXT: [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD3]]
-; AVX-NEXT: [[MUL2:%.*]] = fmul fast double 2.200000e-02, [[CVT0]]
-; AVX-NEXT: [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]]
-; AVX-NEXT: [[MUL3:%.*]] = fmul fast double 9.500000e+00, [[CVT0]]
-; AVX-NEXT: [[ADD4:%.*]] = fadd fast double [[MUL3]], [[ADD2]]
-; AVX-NEXT: ret double [[ADD4]]
-;
-; AVX2-LABEL: @hr_or_mul(
-; AVX2-NEXT: [[CVT0:%.*]] = uitofp i16 3 to double
-; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
-; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
-; AVX2-NEXT: [[TMP3:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[TMP2]]
-; AVX2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP3]])
-; AVX2-NEXT: [[OP_RDX:%.*]] = fadd fast double [[TMP4]], [[CVT0]]
-; AVX2-NEXT: ret double [[OP_RDX]]
+; AVX-NEXT: ret double [[ADD3]]
 ;
 %cvt0 = uitofp i16 3 to double
 %mul0 = fmul fast double 7.000000e+00, %cvt0
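
The profitability concern this file's header comment describes: kept scalar, each fmul+fadd pair can be emitted as a single fused multiply-add. A hedged sketch of that contracted form, using the generic @llvm.fmuladd intrinsic with names of our own choosing (not taken from the test):

  declare double @llvm.fmuladd.f64(double, double, double)

  define double @fma_chain(double %x0, double %y0, double %x1, double %y1) {
    ; Multiply-accumulate chain: one fused op per product, no separate fadds.
    %a0 = call fast double @llvm.fmuladd.f64(double %x0, double %y0, double 0.000000e+00)
    %a1 = call fast double @llvm.fmuladd.f64(double %x1, double %y1, double %a0)
    ret double %a1
  }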
