Commit 005f0fa
[SLP]Improved/fixed FMAD support in reductions
In the initial FMAD patch, potential FMAD nodes were excluded from the reduction analysis entirely to keep that patch small, but this can cause regressions. This patch adds better detection of scalar FMAD reduction operations and correctly calculates the cost of FMAD reduction operations (excluding the cost of the scalar fmuls), as well as of split reduction operations combined with regular FMADs. It also fixes the handling of reduced values with many uses.

Reviewers: RKSimon, gregbedwell, hiraditya

Reviewed By: RKSimon

Pull Request: #152787
1 parent cea2c86 · commit 005f0fa
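The core of the change is how a reduction operation is costed when the reduced value is an fmul with a single fadd user, so that the pair can fuse into an fmuladd. Below is a minimal standalone sketch of that cost adjustment, using hypothetical cost numbers and plain ints in place of LLVM's TargetTransformInfo and InstructionCost; it illustrates the idea only and is not the SLPVectorizer code itself.

#include <iostream>

// Hypothetical reciprocal-throughput costs for one operation on some target.
struct Costs {
  int FAdd;    // plain floating-point add
  int FMul;    // floating-point multiply
  int FMulAdd; // fused multiply-add (llvm.fmuladd)
};

// Cost charged per reduction operation. When the reduced value is an fmul
// with a single fadd user, the fadd fuses into an fmuladd, so charge the
// fmuladd and subtract the fmul that the tree cost already accounts for;
// otherwise fall back to the plain fadd cost.
int reductionOpCost(bool ConvertsToFMA, const Costs &C) {
  return ConvertsToFMA ? C.FMulAdd - C.FMul : C.FAdd;
}

int main() {
  Costs Vec{/*FAdd=*/2, /*FMul=*/2, /*FMulAdd=*/2};
  std::cout << "fmuladd reduction op cost: " << reductionOpCost(true, Vec)
            << "\nplain fadd reduction op cost: " << reductionOpCost(false, Vec)
            << "\n";
  // With equal fmul/fmuladd costs, the fused reduction ops become free
  // relative to the multiplies already counted in the tree, so vectorizing
  // the reduction is not unfairly penalized against scalar fma-style code.
  return 0;
}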

File tree

5 files changed: +189 -72 lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 63 additions & 7 deletions
@@ -23842,7 +23842,8 @@ class HorizontalReduction {
 
   /// Attempt to vectorize the tree found by matchAssociativeReduction.
   Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
-                     const TargetLibraryInfo &TLI, AssumptionCache *AC) {
+                     const TargetLibraryInfo &TLI, AssumptionCache *AC,
+                     DominatorTree &DT) {
     constexpr unsigned RegMaxNumber = 4;
     constexpr unsigned RedValsMaxNumber = 128;
     // If there are a sufficient number of reduction values, reduce
@@ -24241,7 +24242,7 @@ class HorizontalReduction {
 
       // Estimate cost.
       InstructionCost ReductionCost =
-          getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V);
+          getReductionCost(TTI, VL, IsCmpSelMinMax, RdxFMF, V, DT, DL, TLI);
       InstructionCost Cost = V.getTreeCost(VL, ReductionCost);
       LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost
                         << " for reduction\n");
@@ -24546,7 +24547,9 @@ class HorizontalReduction {
   InstructionCost getReductionCost(TargetTransformInfo *TTI,
                                    ArrayRef<Value *> ReducedVals,
                                    bool IsCmpSelMinMax, FastMathFlags FMF,
-                                   const BoUpSLP &R) {
+                                   const BoUpSLP &R, DominatorTree &DT,
+                                   const DataLayout &DL,
+                                   const TargetLibraryInfo &TLI) {
     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
     Type *ScalarTy = ReducedVals.front()->getType();
     unsigned ReduxWidth = ReducedVals.size();
@@ -24571,6 +24574,22 @@ class HorizontalReduction {
         for (User *U : RdxVal->users()) {
           auto *RdxOp = cast<Instruction>(U);
           if (hasRequiredNumberOfUses(IsCmpSelMinMax, RdxOp)) {
+            if (RdxKind == RecurKind::FAdd) {
+              InstructionCost FMACost = canConvertToFMA(
+                  RdxOp, getSameOpcode(RdxOp, TLI), DT, DL, *TTI, TLI);
+              if (FMACost.isValid()) {
+                LLVM_DEBUG(dbgs() << "FMA cost: " << FMACost << "\n");
+                if (auto *I = dyn_cast<Instruction>(RdxVal)) {
+                  // Also, exclude scalar fmul cost.
+                  InstructionCost FMulCost =
+                      TTI->getInstructionCost(I, CostKind);
+                  LLVM_DEBUG(dbgs() << "Minus FMul cost: " << FMulCost << "\n");
+                  FMACost -= FMulCost;
+                }
+                ScalarCost += FMACost;
+                continue;
+              }
+            }
             ScalarCost += TTI->getInstructionCost(RdxOp, CostKind);
             continue;
           }
@@ -24635,8 +24654,45 @@ class HorizontalReduction {
           auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
               std::make_pair(RedTy, true));
           VectorType *RVecTy = getWidenedType(RType, ReduxWidth);
-          VectorCost +=
-              TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
+          InstructionCost FMACost = InstructionCost::getInvalid();
+          if (RdxKind == RecurKind::FAdd) {
+            // Check if the reduction operands can be converted to FMA.
+            SmallVector<Value *> Ops;
+            FastMathFlags FMF;
+            FMF.set();
+            for (Value *RdxVal : ReducedVals) {
+              if (!RdxVal->hasOneUse()) {
+                Ops.clear();
+                break;
+              }
+              if (auto *FPCI = dyn_cast<FPMathOperator>(RdxVal))
+                FMF &= FPCI->getFastMathFlags();
+              Ops.push_back(RdxVal->user_back());
+            }
+            if (!Ops.empty()) {
+              FMACost = canConvertToFMA(Ops, getSameOpcode(Ops, TLI), DT, DL,
+                                        *TTI, TLI);
+              if (FMACost.isValid()) {
+                // Calculate actual FMAD cost.
+                IntrinsicCostAttributes ICA(Intrinsic::fmuladd, RVecTy,
+                                            {RVecTy, RVecTy, RVecTy}, FMF);
+                FMACost = TTI->getIntrinsicInstrCost(ICA, CostKind);
+
+                LLVM_DEBUG(dbgs() << "Vector FMA cost: " << FMACost << "\n");
+                // Also, exclude vector fmul cost.
+                InstructionCost FMulCost = TTI->getArithmeticInstrCost(
+                    Instruction::FMul, RVecTy, CostKind);
+                LLVM_DEBUG(dbgs()
+                           << "Minus vector FMul cost: " << FMulCost << "\n");
+                FMACost -= FMulCost;
+              }
+            }
+          }
+          if (FMACost.isValid())
+            VectorCost += FMACost;
+          else
+            VectorCost +=
+                TTI->getArithmeticInstrCost(RdxOpcode, RVecTy, CostKind);
           if (RType != RedTy) {
             unsigned Opcode = Instruction::Trunc;
             if (RedTy->getScalarSizeInBits() > RType->getScalarSizeInBits())
@@ -25304,7 +25360,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
     HorizontalReduction HorRdx;
     if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
       return nullptr;
-    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
+    return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
   };
   auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
     if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -25449,7 +25505,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
     if (RedCost >= ScalarCost)
       return false;
 
-    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+    return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
   };
   if (Candidates.size() == 1)
     return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
+
+define double @test(ptr %0, ptr %1) {
+; CHECK-LABEL: define double @test(
+; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 144
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 232
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x double>, ptr [[TMP2]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = fmul reassoc nsz <4 x double> [[TMP4]], splat (double 1.000000e+00)
+; CHECK-NEXT: [[TMP10:%.*]] = call reassoc nsz double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP9]])
+; CHECK-NEXT: [[TMP11:%.*]] = fmul double [[TMP5]], 2.000000e+00
+; CHECK-NEXT: [[OP_RDX5:%.*]] = fadd reassoc nsz double [[TMP11]], [[TMP8]]
+; CHECK-NEXT: [[OP_RDX6:%.*]] = fadd reassoc nsz double [[OP_RDX5]], [[TMP6]]
+; CHECK-NEXT: [[OP_RDX7:%.*]] = fadd reassoc nsz double [[OP_RDX6]], [[TMP7]]
+; CHECK-NEXT: [[OP_RDX8:%.*]] = fadd reassoc nsz double [[OP_RDX7]], [[TMP10]]
+; CHECK-NEXT: ret double [[OP_RDX8]]
+;
+entry:
+  %2 = getelementptr i8, ptr %1, i64 144
+  %3 = getelementptr i8, ptr %1, i64 152
+  %4 = getelementptr i8, ptr %1, i64 160
+  %5 = getelementptr i8, ptr %1, i64 168
+  %6 = getelementptr i8, ptr %1, i64 232
+  %7 = load double, ptr %2, align 8
+  %8 = load double, ptr %3, align 8
+  %9 = fadd reassoc nsz double %8, %7
+  %10 = load double, ptr %4, align 8
+  %11 = fadd reassoc nsz double %10, %9
+  %12 = load double, ptr %5, align 8
+  %13 = fadd reassoc nsz double %12, %11
+  %14 = load double, ptr %0, align 8
+  %15 = fadd reassoc nsz double %14, %13
+  %16 = fadd reassoc nsz double %14, %15
+  %17 = load double, ptr %6, align 8
+  %18 = fadd reassoc nsz double %17, %16
+  %19 = load double, ptr %1, align 8
+  %20 = fadd reassoc nsz double %19, %18
+  %21 = load double, ptr %0, align 8
+  %22 = fadd reassoc nsz double %21, %20
+  ret double %22
+}

llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll

Lines changed: 26 additions & 43 deletions
@@ -709,34 +709,25 @@ define float @dot_product_fp32_reorder(ptr %a, ptr %b) {
 
 
 define double @dot_product_fp64(ptr %a, ptr %b) {
-; NON-POW2-LABEL: @dot_product_fp64(
-; NON-POW2-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
-; NON-POW2-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
-; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x double>, ptr [[GEP_A_0]], align 4
-; NON-POW2-NEXT: [[TMP2:%.*]] = load <3 x double>, ptr [[GEP_B_0]], align 4
-; NON-POW2-NEXT: [[TMP3:%.*]] = fmul fast <3 x double> [[TMP1]], [[TMP2]]
-; NON-POW2-NEXT: [[TMP4:%.*]] = call fast double @llvm.vector.reduce.fadd.v3f64(double 0.000000e+00, <3 x double> [[TMP3]])
-; NON-POW2-NEXT: ret double [[TMP4]]
-;
-; POW2-ONLY-LABEL: @dot_product_fp64(
-; POW2-ONLY-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
-; POW2-ONLY-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
-; POW2-ONLY-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
-; POW2-ONLY-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
-; POW2-ONLY-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
-; POW2-ONLY-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
-; POW2-ONLY-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
-; POW2-ONLY-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
-; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
-; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
-; POW2-ONLY-NEXT: ret double [[ADD_1]]
+; CHECK-LABEL: @dot_product_fp64(
+; CHECK-NEXT: [[GEP_A_0:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i32 0
+; CHECK-NEXT: [[L_A_0:%.*]] = load double, ptr [[GEP_A_0]], align 4
+; CHECK-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds double, ptr [[A]], i32 1
+; CHECK-NEXT: [[L_A_1:%.*]] = load double, ptr [[GEP_A_1]], align 4
+; CHECK-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds double, ptr [[A]], i32 2
+; CHECK-NEXT: [[L_A_2:%.*]] = load double, ptr [[GEP_A_2]], align 4
+; CHECK-NEXT: [[GEP_B_0:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i32 0
+; CHECK-NEXT: [[L_B_0:%.*]] = load double, ptr [[GEP_B_0]], align 4
+; CHECK-NEXT: [[GEP_B_1:%.*]] = getelementptr inbounds double, ptr [[B]], i32 1
+; CHECK-NEXT: [[L_B_1:%.*]] = load double, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT: [[GEP_B_2:%.*]] = getelementptr inbounds double, ptr [[B]], i32 2
+; CHECK-NEXT: [[L_B_2:%.*]] = load double, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast double [[L_A_0]], [[L_B_0]]
+; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast double [[L_A_1]], [[L_B_1]]
+; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast double [[L_A_2]], [[L_B_2]]
+; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast double [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast double [[ADD_0]], [[MUL_2]]
+; CHECK-NEXT: ret double [[ADD_1]]
 ;
   %gep.a.0 = getelementptr inbounds double, ptr %a, i32 0
   %l.a.0 = load double, ptr %gep.a.0, align 4
@@ -793,21 +784,13 @@ entry:
 }
 
 define float @reduce_fadd_after_fmul_of_buildvec(float %a, float %b, float %c) {
-; NON-POW2-LABEL: @reduce_fadd_after_fmul_of_buildvec(
-; NON-POW2-NEXT: [[TMP1:%.*]] = insertelement <3 x float> poison, float [[A:%.*]], i32 0
-; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> [[TMP1]], float [[B:%.*]], i32 1
-; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> [[TMP2]], float [[C:%.*]], i32 2
-; NON-POW2-NEXT: [[TMP4:%.*]] = fmul fast <3 x float> [[TMP3]], splat (float 1.000000e+01)
-; NON-POW2-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v3f32(float 0.000000e+00, <3 x float> [[TMP4]])
-; NON-POW2-NEXT: ret float [[TMP5]]
-;
-; POW2-ONLY-LABEL: @reduce_fadd_after_fmul_of_buildvec(
-; POW2-ONLY-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
-; POW2-ONLY-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
-; POW2-ONLY-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
-; POW2-ONLY-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
-; POW2-ONLY-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
-; POW2-ONLY-NEXT: ret float [[ADD_1]]
+; CHECK-LABEL: @reduce_fadd_after_fmul_of_buildvec(
+; CHECK-NEXT: [[MUL_0:%.*]] = fmul fast float [[A:%.*]], 1.000000e+01
+; CHECK-NEXT: [[MUL_1:%.*]] = fmul fast float [[B:%.*]], 1.000000e+01
+; CHECK-NEXT: [[MUL_2:%.*]] = fmul fast float [[C:%.*]], 1.000000e+01
+; CHECK-NEXT: [[ADD_0:%.*]] = fadd fast float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT: [[ADD_1:%.*]] = fadd fast float [[ADD_0]], [[MUL_2]]
+; CHECK-NEXT: ret float [[ADD_1]]
 ;
   %mul.0 = fmul fast float %a, 10.0
   %mul.1 = fmul fast float %b, 10.0

llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll

Lines changed: 17 additions & 12 deletions
@@ -10,19 +10,24 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg
 define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, <8 x ptr> [[TMP1]], <8 x i64> <i64 1, i64 3, i64 5, i64 7, i64 9, i64 11, i64 13, i64 15>
+; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr [[ARG:%.*]], i64 1
 ; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> splat (i1 true), <8 x double> poison)
-; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8
-; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP3]]
-; CHECK-NEXT: [[TMP6:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = fmul fast <8 x double> [[TMP6]], [[TMP3]]
-; CHECK-NEXT: [[TMP8:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP7]])
-; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP5]])
-; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
-; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP9]], i64 1
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[GEP2_0]], align 8
+; CHECK-NEXT: [[GEP2_4:%.*]] = getelementptr inbounds double, ptr [[ARG1]], i64 20
+; CHECK-NEXT: [[TMP1:%.*]] = call <15 x double> @llvm.masked.load.v15f64.p0(ptr [[GEP1_0]], i32 8, <15 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true>, <15 x double> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP3:%.*]] = load <8 x double>, ptr [[ARG1]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <8 x double> [[TMP3]], [[TMP2]]
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP2]], <8 x double> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <4 x double> [[TMP0]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP4]])
+; CHECK-NEXT: [[TMP8:%.*]] = load <4 x double>, ptr [[GEP2_4]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <15 x double> [[TMP1]], <15 x double> poison, <4 x i32> <i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x double> [[TMP8]], [[TMP9]]
+; CHECK-NEXT: [[RDX_OP:%.*]] = fadd fast <4 x double> [[TMP6]], [[TMP10]]
+; CHECK-NEXT: [[TMP11:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[RDX_OP]])
+; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP7]], i64 0
+; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP11]], i64 1
 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> <i64 0, i64 16>
 ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> splat (i1 true))
 ; CHECK-NEXT: ret void
