Skip to content

Commit cfe1909

Browse files
committed
Revert "[SLP]Initial FMAD support (#149102)"
This reverts commit 0fffb9f due to major performance regressions.
1 parent 520db34 commit cfe1909

13 files changed

+292
-841
lines changed

llvm/docs/ReleaseNotes.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ Changes to Vectorizers
7777

7878
* Added initial support for copyable elements in SLP, which models copyable
7979
elements as add <element>, 0, i.e. uses identity constants for missing lanes.
80-
* SLP vectorizer supports initial recognition of FMA/FMAD pattern
8180

8281
Changes to the AArch64 Backend
8382
------------------------------

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 6 additions & 158 deletions
Original file line numberDiff line numberDiff line change
@@ -3883,7 +3883,6 @@ class BoUpSLP {
38833883
enum CombinedOpcode {
38843884
NotCombinedOp = -1,
38853885
MinMax = Instruction::OtherOpsEnd + 1,
3886-
FMulAdd,
38873886
};
38883887
CombinedOpcode CombinedOp = NotCombinedOp;
38893888

@@ -4034,9 +4033,6 @@ class BoUpSLP {
40344033
/// Returns true if any scalar in the list is a copyable element.
40354034
bool hasCopyableElements() const { return !CopyableElements.empty(); }
40364035

4037-
/// Returns the state of the operations.
4038-
const InstructionsState &getOperations() const { return S; }
4039-
40404036
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
40414037
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
40424038
unsigned findLaneForValue(Value *V) const {
@@ -11991,82 +11987,6 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
1199111987
}
1199211988
}
1199311989

11994-
static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
11995-
const InstructionsState &S,
11996-
DominatorTree &DT, const DataLayout &DL,
11997-
TargetTransformInfo &TTI,
11998-
const TargetLibraryInfo &TLI) {
11999-
assert(all_of(VL,
12000-
[](Value *V) {
12001-
return V->getType()->getScalarType()->isFloatingPointTy();
12002-
}) &&
12003-
"Can only convert to FMA for floating point types");
12004-
assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
12005-
12006-
auto CheckForContractable = [&](ArrayRef<Value *> VL) {
12007-
FastMathFlags FMF;
12008-
FMF.set();
12009-
for (Value *V : VL) {
12010-
auto *I = dyn_cast<Instruction>(V);
12011-
if (!I)
12012-
continue;
12013-
// TODO: support for copyable elements.
12014-
Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
12015-
if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
12016-
continue;
12017-
if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12018-
FMF &= FPCI->getFastMathFlags();
12019-
}
12020-
return FMF.allowContract();
12021-
};
12022-
if (!CheckForContractable(VL))
12023-
return InstructionCost::getInvalid();
12024-
// fmul also should be contractable
12025-
InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
12026-
SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
12027-
12028-
InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
12029-
if (!OpS.valid())
12030-
return InstructionCost::getInvalid();
12031-
if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
12032-
return InstructionCost::getInvalid();
12033-
if (!CheckForContractable(Operands.front()))
12034-
return InstructionCost::getInvalid();
12035-
// Compare the costs.
12036-
InstructionCost FMulPlusFAddCost = 0;
12037-
InstructionCost FMACost = 0;
12038-
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
12039-
FastMathFlags FMF;
12040-
FMF.set();
12041-
for (Value *V : VL) {
12042-
auto *I = dyn_cast<Instruction>(V);
12043-
if (!I)
12044-
continue;
12045-
if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12046-
FMF &= FPCI->getFastMathFlags();
12047-
FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12048-
}
12049-
unsigned NumOps = 0;
12050-
for (auto [V, Op] : zip(VL, Operands.front())) {
12051-
auto *I = dyn_cast<Instruction>(Op);
12052-
if (!I || !I->hasOneUse()) {
12053-
if (auto *OpI = dyn_cast<Instruction>(V))
12054-
FMACost += TTI.getInstructionCost(OpI, CostKind);
12055-
if (I)
12056-
FMACost += TTI.getInstructionCost(I, CostKind);
12057-
continue;
12058-
}
12059-
++NumOps;
12060-
if (auto *FPCI = dyn_cast<FPMathOperator>(I))
12061-
FMF &= FPCI->getFastMathFlags();
12062-
FMulPlusFAddCost += TTI.getInstructionCost(I, CostKind);
12063-
}
12064-
Type *Ty = VL.front()->getType();
12065-
IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
12066-
FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
12067-
return FMACost < FMulPlusFAddCost ? FMACost : InstructionCost::getInvalid();
12068-
}
12069-
1207011990
void BoUpSLP::transformNodes() {
1207111991
constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1207211992
BaseGraphSize = VectorizableTree.size();
@@ -12435,25 +12355,6 @@ void BoUpSLP::transformNodes() {
1243512355
}
1243612356
break;
1243712357
}
12438-
case Instruction::FSub:
12439-
case Instruction::FAdd: {
12440-
// Check if possible to convert (a*b)+c to fma.
12441-
if (E.State != TreeEntry::Vectorize ||
12442-
!E.getOperations().isAddSubLikeOp())
12443-
break;
12444-
if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
12445-
.isValid())
12446-
break;
12447-
// This node is a fmuladd node.
12448-
E.CombinedOp = TreeEntry::FMulAdd;
12449-
TreeEntry *FMulEntry = getOperandEntry(&E, 0);
12450-
if (FMulEntry->UserTreeIndex &&
12451-
FMulEntry->State == TreeEntry::Vectorize) {
12452-
// The FMul node is part of the combined fmuladd node.
12453-
FMulEntry->State = TreeEntry::CombinedVectorize;
12454-
}
12455-
break;
12456-
}
1245712358
default:
1245812359
break;
1245912360
}
@@ -13686,11 +13587,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1368613587
}
1368713588
return IntrinsicCost;
1368813589
};
13689-
auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
13690-
Instruction *VI) {
13691-
InstructionCost Cost = canConvertToFMA(VI, S, *DT, *DL, TTI, *TLI);
13692-
return Cost;
13693-
};
1369413590
switch (ShuffleOrOp) {
1369513591
case Instruction::PHI: {
1369613592
// Count reused scalars.
@@ -14031,30 +13927,6 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1403113927
};
1403213928
return GetCostDiff(GetScalarCost, GetVectorCost);
1403313929
}
14034-
case TreeEntry::FMulAdd: {
14035-
auto GetScalarCost = [&](unsigned Idx) {
14036-
if (isa<PoisonValue>(UniqueValues[Idx]))
14037-
return InstructionCost(TTI::TCC_Free);
14038-
return GetFMulAddCost(E->getOperations(),
14039-
cast<Instruction>(UniqueValues[Idx]));
14040-
};
14041-
auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
14042-
FastMathFlags FMF;
14043-
FMF.set();
14044-
for (Value *V : E->Scalars) {
14045-
if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
14046-
FMF &= FPCI->getFastMathFlags();
14047-
if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
14048-
FMF &= FPCIOp->getFastMathFlags();
14049-
}
14050-
}
14051-
IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
14052-
{VecTy, VecTy, VecTy}, FMF);
14053-
InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
14054-
return VecCost + CommonCost;
14055-
};
14056-
return GetCostDiff(GetScalarCost, GetVectorCost);
14057-
}
1405813930
case Instruction::FNeg:
1405913931
case Instruction::Add:
1406013932
case Instruction::FAdd:
@@ -14092,16 +13964,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
1409213964
}
1409313965
TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
1409413966
TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
14095-
InstructionCost ScalarCost = TTI->getArithmeticInstrCost(
14096-
ShuffleOrOp, OrigScalarTy, CostKind, Op1Info, Op2Info, Operands);
14097-
if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
14098-
I && (ShuffleOrOp == Instruction::FAdd ||
14099-
ShuffleOrOp == Instruction::FSub)) {
14100-
InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
14101-
if (IntrinsicCost.isValid())
14102-
ScalarCost = IntrinsicCost;
14103-
}
14104-
return ScalarCost;
13967+
return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
13968+
Op1Info, Op2Info, Operands);
1410513969
};
1410613970
auto GetVectorCost = [=](InstructionCost CommonCost) {
1410713971
if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
@@ -22730,21 +22594,11 @@ class HorizontalReduction {
2273022594
/// Try to find a reduction tree.
2273122595
bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
2273222596
ScalarEvolution &SE, const DataLayout &DL,
22733-
const TargetLibraryInfo &TLI,
22734-
DominatorTree &DT, TargetTransformInfo &TTI) {
22597+
const TargetLibraryInfo &TLI) {
2273522598
RdxKind = HorizontalReduction::getRdxKind(Root);
2273622599
if (!isVectorizable(RdxKind, Root))
2273722600
return false;
2273822601

22739-
// FMA reduction root - skip.
22740-
auto CheckForFMA = [&](Instruction *I) {
22741-
return RdxKind == RecurKind::FAdd &&
22742-
canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI)
22743-
.isValid();
22744-
};
22745-
if (CheckForFMA(Root))
22746-
return false;
22747-
2274822602
// Analyze "regular" integer/FP types for reductions - no target-specific
2274922603
// types or pointers.
2275022604
Type *Ty = Root->getType();
@@ -22782,7 +22636,7 @@ class HorizontalReduction {
2278222636
// Also, do not try to reduce const values, if the operation is not
2278322637
// foldable.
2278422638
if (!EdgeInst || Level > RecursionMaxDepth ||
22785-
getRdxKind(EdgeInst) != RdxKind || CheckForFMA(EdgeInst) ||
22639+
getRdxKind(EdgeInst) != RdxKind ||
2278622640
IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
2278722641
!hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
2278822642
!isVectorizable(RdxKind, EdgeInst) ||
@@ -24351,13 +24205,13 @@ bool SLPVectorizerPass::vectorizeHorReduction(
2435124205
Stack.emplace(SelectRoot(), 0);
2435224206
SmallPtrSet<Value *, 8> VisitedInstrs;
2435324207
bool Res = false;
24354-
auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
24208+
auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
2435524209
if (R.isAnalyzedReductionRoot(Inst))
2435624210
return nullptr;
2435724211
if (!isReductionCandidate(Inst))
2435824212
return nullptr;
2435924213
HorizontalReduction HorRdx;
24360-
if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI))
24214+
if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
2436124215
return nullptr;
2436224216
return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
2436324217
};
@@ -24423,12 +24277,6 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
2442324277

2442424278
if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
2442524279
return false;
24426-
// Skip potential FMA candidates.
24427-
if ((I->getOpcode() == Instruction::FAdd ||
24428-
I->getOpcode() == Instruction::FSub) &&
24429-
canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
24430-
.isValid())
24431-
return false;
2443224280

2443324281
Value *P = I->getParent();
2443424282

llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll

Lines changed: 14 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -8,18 +8,15 @@ target triple = "aarch64--linux-gnu"
88
define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
99
; CHECK-LABEL: @test1(
1010
; CHECK-NEXT: entry:
11+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0
12+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
1113
; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]]
1214
; CHECK: for.body3.lr.ph:
13-
; CHECK-NEXT: [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float
14-
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float
15-
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4
16-
; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]]
17-
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1
18-
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4
19-
; CHECK-NEXT: [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]]
20-
; CHECK-NEXT: [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]]
21-
; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]]
22-
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL11]], [[MUL12]]
15+
; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
16+
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
17+
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
18+
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
19+
; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
2320
; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
2421
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
2522
; CHECK: for.end27:
@@ -50,18 +47,15 @@ for.end27:
5047
define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
5148
; CHECK-LABEL: @test2(
5249
; CHECK-NEXT: entry:
50+
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0
51+
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
5352
; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]]
5453
; CHECK: for.body3.lr.ph:
55-
; CHECK-NEXT: [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float
56-
; CHECK-NEXT: [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float
57-
; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4
58-
; CHECK-NEXT: [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]]
59-
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1
60-
; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4
61-
; CHECK-NEXT: [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]]
62-
; CHECK-NEXT: [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]]
63-
; CHECK-NEXT: [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]]
64-
; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[MUL12]], [[MUL11]]
54+
; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
55+
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
56+
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
57+
; CHECK-NEXT: [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
58+
; CHECK-NEXT: [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
6559
; CHECK-NEXT: [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
6660
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
6761
; CHECK: for.end27:

0 commit comments

Comments (0)