diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 033910121a54f..67a03478b85ef 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -20363,6 +20363,77 @@ Arguments:
 """"""""""
 
 The argument to this intrinsic must be a vector of floating-point values.
 
+Vector Partial Reduction Intrinsics
+-----------------------------------
+
+Partial reductions of vectors can be expressed using the intrinsics described in
+this section. Each one reduces the concatenation of the two vector arguments
+down to the number of elements of the result vector type.
+
+Other than the reduction operator (e.g. add, fadd), the way in which the
+concatenated arguments are reduced is entirely unspecified. By their nature these
+intrinsics are not expected to be useful in isolation but can instead be used to
+implement the first phase of an overall reduction operation.
+
+The typical use case is loop vectorization, where reductions are split into an
+in-loop phase, where maintaining an unordered vector result is important for
+performance, and an out-of-loop phase, which is required to calculate the final
+scalar result.
+
+By avoiding the introduction of new ordering constraints, these intrinsics
+enhance the ability to leverage a target's accumulation instructions.
+
+'``llvm.vector.partial.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b)
+      declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b)
+      declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b)
+      declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b)
+
+Arguments:
+""""""""""
+
+The first argument is an integer vector with the same type as the result.
+
+The second argument is a vector with a length that is a known integer multiple
+of the result's length, while maintaining the same element type.
+
+'``llvm.vector.partial.reduce.fadd.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> %a, <8 x float> %b)
+      declare <vscale x 4 x float> @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32(<vscale x 4 x float> %a, <vscale x 8 x float> %b)
+
+Arguments:
+""""""""""
+
+The first argument is a floating-point vector with the same type as the result.
+
+The second argument is a vector with a length that is a known integer multiple
+of the result's length, while maintaining the same element type.
+
+Semantics:
+""""""""""
+
+As the way in which the arguments to this floating-point intrinsic are reduced
+is unspecified, this intrinsic assumes that floating-point reassociation and
+contraction can be leveraged to implement the reduction. This may result in
+variations in the result due to reordering, or due to lowering to different
+instructions (including combining multiple instructions into a single one).
+
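A minimal sketch of the intended usage pattern, assuming a vectorized loop that
accumulates eight ``float`` products per iteration into a four-element
accumulator (the value names below are illustrative, not taken from the patch):

::

      ; In-loop phase: fold eight products into the four running partial sums.
      ; How the products are distributed across the four lanes is unspecified.
      %mul      = fmul <8 x float> %a.wide, %b.wide
      %acc.next = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> %acc, <8 x float> %mul)

      ; Out-of-loop phase: reduce the accumulator to the final scalar result.
      %sum = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %acc.next)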
 '``llvm.vector.insert``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -20736,50 +20807,6 @@ Note that it has the following implications:
 
 - If ``%cnt`` is non-zero, the return value is non-zero as well.
 - If ``%cnt`` is less than or equal to ``%max_lanes``, the return value is equal to ``%cnt``.
 
-'``llvm.vector.partial.reduce.add.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Syntax:
-"""""""
-This is an overloaded intrinsic.
-
-::
-
-      declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b)
-      declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b)
-      declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b)
-      declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b)
-
-Overview:
-"""""""""
-
-The '``llvm.vector.partial.reduce.add.*``' intrinsics reduce the
-concatenation of the two vector arguments down to the number of elements of the
-result vector type.
-
-Arguments:
-""""""""""
-
-The first argument is an integer vector with the same type as the result.
-
-The second argument is a vector with a length that is a known integer multiple
-of the result's type, while maintaining the same element type.
-
-Semantics:
-""""""""""
-
-Other than the reduction operator (e.g. add) the way in which the concatenated
-arguments is reduced is entirely unspecified. By their nature these intrinsics
-are not expected to be useful in isolation but instead implement the first phase
-of an overall reduction operation.
-
-The typical use case is loop vectorization where reductions are split into an
-in-loop phase, where maintaining an unordered vector result is important for
-performance, and an out-of-loop phase to calculate the final scalar result.
-
-By avoiding the introduction of new ordering constraints, these intrinsics
-enhance the ability to leverage a target's accumulation instructions.
-
 '``llvm.experimental.vector.histogram.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5d3b233ed6b6a..08b86f02c48f7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -222,7 +222,12 @@ class TargetTransformInfoImplBase;
 /// for IR-level transformations.
 class TargetTransformInfo {
 public:
-  enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend };
+  enum PartialReductionExtendKind {
+    PR_None,
+    PR_SignExtend,
+    PR_ZeroExtend,
+    PR_FPExtend
+  };
 
   /// Get the kind of extension that an instruction represents.
   LLVM_ABI static PartialReductionExtendKind
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index ff3dd0d4c3c51..1a3fd27e64c4f 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1516,6 +1516,7 @@ enum NodeType {
   PARTIAL_REDUCE_SMLA,  // sext, sext
   PARTIAL_REDUCE_UMLA,  // zext, zext
   PARTIAL_REDUCE_SUMLA, // sext, zext
+  PARTIAL_REDUCE_FMLA,  // fpext, fpext
 
   // The `llvm.experimental.stackmap` intrinsic.
   // Operands: input chain, glue, <id>, <numShadowBytes>, [live0[, live1...]]
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 69713d0d84011..ed5c7572ab592 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1938,6 +1938,10 @@ LLVM_ABI bool isNullOrNullSplat(SDValue V, bool AllowUndefs = false);
 /// be zero.
 LLVM_ABI bool isOneOrOneSplat(SDValue V, bool AllowUndefs = false);
 
+/// Return true if the value is a constant floating-point value, or a splatted
+/// vector of a constant floating-point value, of 1.0 (with no undefs).
+LLVM_ABI bool isOneOrOneSplatFP(SDValue V, bool AllowUndefs = false); + /// Return true if the value is a constant -1 integer or a splatted vector of a /// constant -1 integer (with no undefs). /// Does not permit build vector implicit truncation. diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 73f2c55a71125..92405bebbca46 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1664,7 +1664,7 @@ class LLVM_ABI TargetLoweringBase { LegalizeAction getPartialReduceMLAAction(unsigned Opc, EVT AccVT, EVT InputVT) const { assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA || - Opc == ISD::PARTIAL_REDUCE_SUMLA); + Opc == ISD::PARTIAL_REDUCE_SUMLA || Opc == ISD::PARTIAL_REDUCE_FMLA); PartialReduceActionTypes Key = {Opc, AccVT.getSimpleVT().SimpleTy, InputVT.getSimpleVT().SimpleTy}; auto It = PartialReduceMLAActions.find(Key); @@ -2766,7 +2766,7 @@ class LLVM_ABI TargetLoweringBase { void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action) { assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA || - Opc == ISD::PARTIAL_REDUCE_SUMLA); + Opc == ISD::PARTIAL_REDUCE_SUMLA || Opc == ISD::PARTIAL_REDUCE_FMLA); assert(AccVT.isValid() && InputVT.isValid() && "setPartialReduceMLAAction types aren't valid"); PartialReduceActionTypes Key = {Opc, AccVT.SimpleTy, InputVT.SimpleTy}; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 12d1c2528f977..5163d2052561e 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2799,6 +2799,10 @@ def int_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; +def int_vector_partial_reduce_fadd : DefaultAttrsIntrinsic<[LLVMMatchType<0>], + [llvm_anyfloat_ty, llvm_anyfloat_ty], + [IntrNoMem]>; + //===----------------- Pointer Authentication Intrinsics ------------------===// // diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 07a858fd682fc..a9750a5ab03f9 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -527,6 +527,8 @@ def partial_reduce_smla : SDNode<"ISD::PARTIAL_REDUCE_SMLA", SDTPartialReduceMLA>; def partial_reduce_sumla : SDNode<"ISD::PARTIAL_REDUCE_SUMLA", SDTPartialReduceMLA>; +def partial_reduce_fmla : SDNode<"ISD::PARTIAL_REDUCE_FMLA", + SDTPartialReduceMLA>; def fadd : SDNode<"ISD::FADD" , SDTFPBinOp, [SDNPCommutative]>; def fsub : SDNode<"ISD::FSUB" , SDTFPBinOp>; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index bf62623099a97..b3e677d7d8731 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1005,6 +1005,8 @@ TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) { return PR_SignExtend; if (isa(I)) return PR_ZeroExtend; + if (isa(I)) + return PR_FPExtend; return PR_None; } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 310d35d9b1d1e..bf17e67276df2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2042,6 +2042,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case 
ISD::PARTIAL_REDUCE_FMLA: return visitPARTIAL_REDUCE_MLA(N); case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N); case ISD::LIFETIME_END: return visitLIFETIME_END(N); @@ -12988,6 +12989,9 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) { // // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) +// +// partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0)) +// -> partial_reduce_fmla(acc, a, b) SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDLoc DL(N); auto *Context = DAG.getContext(); @@ -12996,7 +13000,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDValue Op2 = N->getOperand(2); unsigned Opc = Op1->getOpcode(); - if (Opc != ISD::MUL && Opc != ISD::SHL) + if (Opc != ISD::MUL && Opc != ISD::FMUL && Opc != ISD::SHL) return SDValue(); SDValue LHS = Op1->getOperand(0); @@ -13015,13 +13019,16 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { Opc = ISD::MUL; } - APInt C; - if (Opc != ISD::MUL || !ISD::isConstantSplatVector(Op2.getNode(), C) || - !C.isOne()) + if (!(Opc == ISD::MUL && llvm::isOneOrOneSplat(Op2)) && + !(Opc == ISD::FMUL && llvm::isOneOrOneSplatFP(Op2))) return SDValue(); + auto IsIntOrFPExtOpcode = [](unsigned int Opcode) { + return (ISD::isExtOpcode(Opcode) || Opcode == ISD::FP_EXTEND); + }; + unsigned LHSOpcode = LHS->getOpcode(); - if (!ISD::isExtOpcode(LHSOpcode)) + if (!IsIntOrFPExtOpcode(LHSOpcode)) return SDValue(); SDValue LHSExtOp = LHS->getOperand(0); @@ -13029,6 +13036,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) + APInt C; if (ISD::isConstantSplatVector(RHS.getNode(), C)) { // TODO: Make use of partial_reduce_sumla here APInt CTrunc = C.trunc(LHSExtOpVT.getScalarSizeInBits()); @@ -13053,7 +13061,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { } unsigned RHSOpcode = RHS->getOpcode(); - if (!ISD::isExtOpcode(RHSOpcode)) + if (!IsIntOrFPExtOpcode(RHSOpcode)) return SDValue(); SDValue RHSExtOp = RHS->getOperand(0); @@ -13070,6 +13078,8 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { else if (LHSOpcode == ISD::ZERO_EXTEND && RHSOpcode == ISD::SIGN_EXTEND) { NewOpc = ISD::PARTIAL_REDUCE_SUMLA; std::swap(LHSExtOp, RHSExtOp); + } else if (LHSOpcode == ISD::FP_EXTEND && RHSOpcode == ISD::FP_EXTEND) { + NewOpc = ISD::PARTIAL_REDUCE_FMLA; } else return SDValue(); // For a 2-stage extend the signedness of both of the extends must match @@ -13097,30 +13107,33 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // -> partial.reduce.smla(acc, op, splat(trunc(1))) // partial.reduce.sumla(acc, sext(op), splat(1)) // -> partial.reduce.smla(acc, op, splat(trunc(1))) +// partial.reduce.fmla(acc, fpext(op), splat(1.0)) +// -> partial.reduce.fmla(acc, op, splat(1.0)) SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDLoc DL(N); SDValue Acc = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue Op2 = N->getOperand(2); - APInt ConstantOne; - if (!ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) || - !ConstantOne.isOne()) + if (!llvm::isOneOrOneSplat(Op2) && !llvm::isOneOrOneSplatFP(Op2)) return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); - if (!ISD::isExtOpcode(Op1Opcode)) + if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND) return SDValue(); - bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND; + bool Op1IsSigned = + Op1Opcode == ISD::SIGN_EXTEND || Op1Opcode == ISD::FP_EXTEND; bool 
NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA; EVT AccElemVT = Acc.getValueType().getVectorElementType(); if (Op1IsSigned != NodeIsSigned && Op1.getValueType().getVectorElementType() != AccElemVT) return SDValue(); - unsigned NewOpcode = - Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA; + unsigned NewOpcode = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? ISD::PARTIAL_REDUCE_FMLA + : Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA + : ISD::PARTIAL_REDUCE_UMLA; SDValue UnextOp1 = Op1.getOperand(0); EVT UnextOp1VT = UnextOp1.getValueType(); @@ -13130,8 +13143,12 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { TLI.getTypeToTransformTo(*Context, UnextOp1VT))) return SDValue(); + SDValue Constant = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? DAG.getConstantFP(1, DL, UnextOp1VT) + : DAG.getConstant(1, DL, UnextOp1VT); + return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1, - DAG.getConstant(1, DL, UnextOp1VT)); + Constant); } SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 8e423c4f83b38..94751be5b7986 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -534,6 +534,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Action = TLI.getPartialReduceMLAAction(Op.getOpcode(), Node->getValueType(0), Node->getOperand(1).getValueType()); @@ -1243,6 +1244,7 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Results.push_back(TLI.expandPartialReduceMLA(Node, DAG)); return; case ISD::VECREDUCE_SEQ_FADD: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 3b5f83f7c089a..4b40b32621418 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1459,6 +1459,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi); break; case ISD::GET_ACTIVE_LANE_MASK: @@ -3674,6 +3675,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Res = SplitVecOp_PARTIAL_REDUCE_MLA(N); break; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 90edaf3ef5471..6f2a009676442 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8404,7 +8404,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: - case ISD::PARTIAL_REDUCE_SUMLA: { + case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: { [[maybe_unused]] EVT AccVT = N1.getValueType(); [[maybe_unused]] EVT Input1VT = N2.getValueType(); [[maybe_unused]] EVT Input2VT = N3.getValueType(); @@ -13054,6 +13055,11 @@ bool llvm::isOneOrOneSplat(SDValue N, bool AllowUndefs) { 
return C && C->isOne(); } +bool llvm::isOneOrOneSplatFP(SDValue N, bool AllowUndefs) { + ConstantFPSDNode *C = isConstOrConstSplatFP(N, AllowUndefs); + return C && C->isExactlyValue(1.0); +} + bool llvm::isAllOnesOrAllOnesSplat(SDValue N, bool AllowUndefs) { N = peekThroughBitcasts(N); unsigned BitWidth = N.getScalarValueSizeInBits(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 20a0efd3afa1c..3b7dadbf32124 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8081,6 +8081,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Input, DAG.getConstant(1, sdl, Input.getValueType()))); return; } + case Intrinsic::vector_partial_reduce_fadd: { + SDValue Acc = getValue(I.getOperand(0)); + SDValue Input = getValue(I.getOperand(1)); + setValue(&I, DAG.getNode( + ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc, + Input, DAG.getConstantFP(1.0, sdl, Input.getValueType()))); + return; + } case Intrinsic::experimental_cttz_elts: { auto DL = getCurSDLoc(); SDValue Op = getValue(I.getOperand(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 39cbfad6d0be1..8495e9691c77b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -588,6 +588,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { return "partial_reduce_smla"; case ISD::PARTIAL_REDUCE_SUMLA: return "partial_reduce_sumla"; + case ISD::PARTIAL_REDUCE_FMLA: + return "partial_reduce_fmla"; case ISD::LOOP_DEPENDENCE_WAR_MASK: return "loop_dep_war"; case ISD::LOOP_DEPENDENCE_RAW_MASK: diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 920dff935daed..0d9a133000bd4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12061,22 +12061,32 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(), MulOpVT.getVectorElementCount()); - unsigned ExtOpcLHS = N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA - ? ISD::ZERO_EXTEND - : ISD::SIGN_EXTEND; - unsigned ExtOpcRHS = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA - ? 
ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND; + unsigned ExtOpcLHS, ExtOpcRHS; + switch (N->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode"); + case ISD::PARTIAL_REDUCE_UMLA: + ExtOpcLHS = ExtOpcRHS = ISD::ZERO_EXTEND; + break; + case ISD::PARTIAL_REDUCE_SMLA: + ExtOpcLHS = ExtOpcRHS = ISD::SIGN_EXTEND; + break; + case ISD::PARTIAL_REDUCE_FMLA: + ExtOpcLHS = ExtOpcRHS = ISD::FP_EXTEND; + break; + } if (ExtMulOpVT != MulOpVT) { MulLHS = DAG.getNode(ExtOpcLHS, DL, ExtMulOpVT, MulLHS); MulRHS = DAG.getNode(ExtOpcRHS, DL, ExtMulOpVT, MulRHS); } SDValue Input = MulLHS; - APInt ConstantOne; - if (!ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) || - !ConstantOne.isOne()) + if (N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA) { + if (!llvm::isOneOrOneSplatFP(MulRHS)) + Input = DAG.getNode(ISD::FMUL, DL, ExtMulOpVT, MulLHS, MulRHS); + } else if (!llvm::isOneOrOneSplat(MulRHS)) { Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + } unsigned Stride = AccVT.getVectorMinNumElements(); unsigned ScaleFactor = MulOpVT.getVectorMinNumElements() / Stride; @@ -12086,10 +12096,13 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, for (unsigned I = 0; I < ScaleFactor; I++) Subvectors.push_back(DAG.getExtractSubvector(DL, AccVT, Input, I * Stride)); + unsigned FlatNode = + N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FADD : ISD::ADD; + // Flatten the subvector tree while (Subvectors.size() > 1) { Subvectors.push_back( - DAG.getNode(ISD::ADD, DL, AccVT, {Subvectors[0], Subvectors[1]})); + DAG.getNode(FlatNode, DL, AccVT, {Subvectors[0], Subvectors[1]})); Subvectors.pop_front(); Subvectors.pop_front(); } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 03da1547b652f..7efba21238458 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6578,6 +6578,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } break; } + case Intrinsic::vector_partial_reduce_fadd: case Intrinsic::vector_partial_reduce_add: { VectorType *AccTy = cast(Call.getArgOperand(0)->getType()); VectorType *VecTy = cast(Call.getArgOperand(1)->getType()); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a81de5c5adc34..abe1af23b4e40 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1921,6 +1921,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal); setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal); } + + // Handle floating-point partial reduction + if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32, + MVT::nxv8f16, Legal); + } } // Handle non-aliasing elements mask @@ -2288,6 +2294,11 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { MVT::getVectorVT(MVT::i8, NumElts * 8), Custom); } + if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT, + MVT::getVectorVT(MVT::f16, NumElts * 2), Custom); + } + // Lower fixed length vector operations to scalable equivalents. 
setOperationAction(ISD::ABDS, VT, Default); setOperationAction(ISD::ABDU, VT, Default); @@ -7911,6 +7922,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: return LowerPARTIAL_REDUCE_MLA(Op, DAG); } } diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 98a128e582866..44628eacc83c0 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -375,6 +375,11 @@ def AArch64fclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), node:$Zm) ]>; +def AArch64fdot : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), + [(int_aarch64_sve_fdot_x2 node:$Zd, node:$Zn, node:$Zm), + (partial_reduce_fmla node:$Zd, node:$Zn, node:$Zm) + ]>; + def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>, SDTCisSameAs<0,3> @@ -4251,7 +4256,7 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>; let Predicates = [HasSVE2p1_or_SME2] in { defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>; -defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; +defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, AArch64fdot>; defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e3370d31a0e39..497d4fb4d3392 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5640,7 +5640,8 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( (!ST->isNeonAvailable() || !ST->hasDotProd())) return Invalid; - if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) || + if ((Opcode != Instruction::Add && Opcode != Instruction::Sub && + Opcode != Instruction::FAdd) || OpAExtend == TTI::PR_None) return Invalid; @@ -5650,7 +5651,8 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( // We only support multiply binary operations for now, and for muls we // require the types being extended to be the same. 
- if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB)) + if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) || + InputTypeA != InputTypeB)) return Invalid; bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index adf27bed3d749..129083eb0eb27 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7967,7 +7967,8 @@ bool VPRecipeBuilder::getScaledReductions( continue; } Value *ExtOp; - if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp)))) + if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))) && + !match(OpI, m_FPExt(m_Value(ExtOp)))) return false; Exts[I] = cast(OpI); @@ -8138,6 +8139,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, return nullptr; unsigned ReductionOpcode = Reduction->getOpcode(); + if (ReductionOpcode == Instruction::FAdd && !Reduction->hasAllowReassoc()) + return nullptr; if (ReductionOpcode == Instruction::Sub) { auto *const Zero = ConstantInt::get(Reduction->getType(), 0); SmallVector Ops; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1f1b42bb9c19f..cd08f95254de5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -322,6 +322,8 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, return TTI::PR_ZeroExtend; if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) return TTI::PR_SignExtend; + if (WidenCastR->getOpcode() == Instruction::CastOps::FPExt) + return TTI::PR_FPExtend; return TTI::PR_None; }; @@ -374,8 +376,9 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, void VPPartialReductionRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; - assert(getOpcode() == Instruction::Add && - "Unhandled partial reduction opcode"); + assert( + (getOpcode() == Instruction::Add || getOpcode() == Instruction::FAdd) && + "Unhandled partial reduction opcode"); Value *BinOpVal = State.get(getOperand(1)); Value *PhiVal = State.get(getOperand(0)); @@ -383,9 +386,20 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) { Type *RetTy = PhiVal->getType(); - CallInst *V = - Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add, - {PhiVal, BinOpVal}, nullptr, "partial.reduce"); + enum llvm::Intrinsic::IndependentIntrinsics PRIntrinsic; + switch (getOpcode()) { + case Instruction::Add: { + PRIntrinsic = Intrinsic::vector_partial_reduce_add; + break; + } + case Instruction::FAdd: { + PRIntrinsic = Intrinsic::vector_partial_reduce_fadd; + break; + } + } + + CallInst *V = Builder.CreateIntrinsic(RetTy, PRIntrinsic, {PhiVal, BinOpVal}, + nullptr, "partial.reduce"); State.set(this, V); } diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll new file mode 100644 index 0000000000000..c055940768dfa --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 +; RUN: llc 
-global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 + +define @fdot_wide_nxv4f32( %acc, %a, %b) { +; SVE2-LABEL: fdot_wide_nxv4f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: uunpklo z3.s, z1.h +; SVE2-NEXT: uunpklo z4.s, z2.h +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: uunpkhi z1.s, z1.h +; SVE2-NEXT: uunpkhi z2.s, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fcvt z4.s, p0/m, z4.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fmul z3.s, z3.s, z4.s +; SVE2-NEXT: fmul z1.s, z1.s, z2.s +; SVE2-NEXT: fadd z0.s, z0.s, z3.s +; SVE2-NEXT: fadd z0.s, z0.s, z1.s +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_nxv4f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: ret +entry: + %a.wide = fpext %a to + %b.wide = fpext %b to + %mult = fmul %a.wide, %b.wide + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %mult) + ret %partial.reduce +} + +define @fdot_splat_nxv4f32( %acc, %a) { +; SVE2-LABEL: fdot_splat_nxv4f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: uunpklo z2.s, z1.h +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: uunpkhi z1.s, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fadd z0.s, z0.s, z2.s +; SVE2-NEXT: fadd z0.s, z0.s, z1.s +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_splat_nxv4f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: fmov z2.h, #1.00000000 +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: ret +entry: + %a.wide = fpext %a to + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a.wide) + ret %partial.reduce +} + +define @partial_reduce_nxv8f16( %acc, %a) { +; CHECK-LABEL: partial_reduce_nxv8f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd z0.h, z0.h, z1.h +; CHECK-NEXT: fadd z0.h, z0.h, z2.h +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + ret %partial.reduce +} + +define @partial_reduce_nxv4f32( %acc, %a) { +; CHECK-LABEL: partial_reduce_nxv4f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd z0.s, z0.s, z1.s +; CHECK-NEXT: fadd z0.s, z0.s, z2.s +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + ret %partial.reduce +} + +define @partial_reduce_nxv2f64( %acc, %a) { +; CHECK-LABEL: partial_reduce_nxv2f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd z0.d, z0.d, z1.d +; CHECK-NEXT: fadd z0.d, z0.d, z2.d +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + ret %partial.reduce +} diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll new file mode 100644 index 0000000000000..b07b571413881 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll @@ -0,0 +1,230 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 + +define void @fdot_wide_v8f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,0) { +; 
SVE2-LABEL: fdot_wide_v8f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl8 +; SVE2-NEXT: mov x8, #8 // =0x8 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v8f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl8 +; SVE2P1-NEXT: ptrue p1.h, vl16 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <8 x float>, ptr %accptr + %a = load <16 x half>, ptr %aptr + %b = load <16 x half>, ptr %bptr + %a.wide = fpext <16 x half> %a to <16 x float> + %b.wide = fpext <16 x half> %b to <16 x float> + %mult = fmul <16 x float> %a.wide, %b.wide + %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) + store <8 x float> %partial.reduce, ptr %accptr + ret void +} + +define void @fdot_wide_v16f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(4,0) { +; SVE2-LABEL: fdot_wide_v16f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl16 +; SVE2-NEXT: mov x8, #16 // =0x10 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v16f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl16 +; SVE2P1-NEXT: ptrue p1.h, vl32 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <16 x float>, ptr %accptr + %a = load <32 x half>, ptr %aptr + %b = load <32 x half>, ptr %bptr + %a.wide = fpext <32 x half> %a to <32 x float> + %b.wide = fpext <32 x half> %b to <32 x float> + %mult = fmul <32 x float> %a.wide, %b.wide + %partial.reduce = call <16 x float> @llvm.vector.partial.reduce.fadd(<16 x float> %acc, <32 x float> %mult) + store <16 x float> %partial.reduce, ptr %accptr + ret void +} + +define void @fdot_wide_v32f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(8,0) { +; SVE2-LABEL: fdot_wide_v32f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl32 +; SVE2-NEXT: mov x8, #32 // =0x20 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: 
fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v32f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl32 +; SVE2P1-NEXT: ptrue p1.h, vl64 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <32 x float>, ptr %accptr + %a = load <64 x half>, ptr %aptr + %b = load <64 x half>, ptr %bptr + %a.wide = fpext <64 x half> %a to <64 x float> + %b.wide = fpext <64 x half> %b to <64 x float> + %mult = fmul <64 x float> %a.wide, %b.wide + %partial.reduce = call <32 x float> @llvm.vector.partial.reduce.fadd(<32 x float> %acc, <64 x float> %mult) + store <32 x float> %partial.reduce, ptr %accptr + ret void +} + +define void @fdot_wide_v64f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(16,0) { +; SVE2-LABEL: fdot_wide_v64f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl64 +; SVE2-NEXT: mov x8, #64 // =0x40 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v64f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl64 +; SVE2P1-NEXT: ptrue p1.h, vl128 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <64 x float>, ptr %accptr + %a = load <128 x half>, ptr %aptr + %b = load <128 x half>, ptr %bptr + %a.wide = fpext <128 x half> %a to <128 x float> + %b.wide = fpext <128 x half> %b to <128 x float> + %mult = fmul <128 x float> %a.wide, %b.wide + %partial.reduce = call <64 x float> @llvm.vector.partial.reduce.fadd(<64 x float> %acc, <128 x float> %mult) + store <64 x float> %partial.reduce, ptr %accptr + ret void +} + +define <4 x float> @fixed_fdot_wide(<4 x float> %acc, <8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: fixed_fdot_wide: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtl v3.4s, v1.4h +; CHECK-NEXT: fcvtl v4.4s, v2.4h +; CHECK-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-NEXT: fcvtl2 v2.4s, v2.8h +; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %a.wide = fpext <8 x half> %a to <8 x float> + %b.wide = fpext <8 x half> %b to <8 x float> + %mult = fmul <8 x float> %a.wide, %b.wide + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) + ret <4 x float> %partial.reduce +} + +define <8 x half> 
@partial_reduce_half(<8 x half> %acc, <16 x half> %a) { +; CHECK-LABEL: partial_reduce_half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ret +entry: + %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) + ret <8 x half> %partial.reduce +} + +define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) { +; CHECK-LABEL: partial_reduce_float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ret +entry: + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) + ret <4 x float> %partial.reduce +} + +define <2 x double> @partial_reduce_double(<2 x double> %acc, <4 x double> %a) { +; CHECK-LABEL: partial_reduce_double: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d +; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d +; CHECK-NEXT: ret +entry: + %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) + ret <2 x double> %partial.reduce +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll new file mode 100644 index 0000000000000..f748ec2744ee8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll @@ -0,0 +1,1101 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -mattr=+sve2p1,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+sve2p1,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -mattr=+sve2p1,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define float @fdotp(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define float @fdotp( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = fmul <8 x float> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call 
<4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: ret float [[TMP7]] +; +; CHECK-INTERLEAVED-LABEL: define float @fdotp( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[TMP0]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr half, ptr [[TMP4]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x half>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x half>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = fpext <8 x half> [[WIDE_LOAD3]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = fpext <8 x half> [[WIDE_LOAD4]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = fmul <8 x float> [[TMP7]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP8]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI1]], <8 x float> [[TMP9]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br 
label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: ret float [[TMP11]] +; +; CHECK-MAXBW-LABEL: define float @fdotp( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP4]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP6]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (float -0.000000e+00), float 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP0]], align 1 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32( [[VEC_PHI]], [[TMP8]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] 
= add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret float [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi float [ 0.0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 1 + %ext.a = fpext half %load.a to float + %gep.b = getelementptr half, ptr %b, i64 %iv + %load.b = load half, ptr %gep.b, align 1 + %ext.b = fpext half %load.b to float + %mul = fmul float %ext.b, %ext.a + %add = fadd reassoc float %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret float %add +} + +define double @not_fdotp_different_types(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define double @not_fdotp_different_types( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x double> [ , %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x double> +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr float, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = fpext <8 x float> [[WIDE_LOAD1]] to <8 x double> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = fmul <8 x double> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = fadd reassoc <8 x double> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: ret double [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define double @not_fdotp_different_types( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x double> [ , %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[GEP_A]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr float, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[GEP_B]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP5]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = fpext <8 x float> [[WIDE_LOAD3]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = fpext <8 x float> [[WIDE_LOAD4]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fmul <8 x double> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = fmul <8 x double> [[TMP7]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP10]] = fadd reassoc <8 x double> [[TMP8]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP11]] = fadd reassoc <8 x double> [[TMP9]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <8 x double> [[TMP11]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: ret double [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define double @not_fdotp_different_types( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (double -0.000000e+00), double 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; 
CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 2 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP9]] = fadd reassoc [[TMP8]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call reassoc double @llvm.vector.reduce.fadd.nxv8f64(double -0.000000e+00, [[TMP9]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to double +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr float, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load float, ptr [[GEP_B]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext float [[LOAD_B]] to double +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul double [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc double [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi double [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret double [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi double [ 0.0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 1 + %ext.a = fpext half %load.a to double + %gep.b = getelementptr float, ptr %b, i64 %iv + %load.b = load float, ptr %gep.b, align 2 + %ext.b = fpext float %load.b to double + %mul = fmul double %ext.b, %ext.a + %add = fadd reassoc double %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret double %add +} + +define float @not_fdotp_not_phi(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define float @not_fdotp_not_phi( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*]]: +; 
CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = fadd reassoc [[TMP8]], [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP9]], i32 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVE1: [[SCALAR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[FOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; 
CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define float @not_fdotp_not_phi( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr half, ptr [[TMP4]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[TMP9]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = fmul [[TMP13]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = fadd reassoc [[TMP14]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP15]], i32 [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVED: [[SCALAR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 
0.000000e+00, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVED: [[FOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define float @not_fdotp_not_phi( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 1 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = fadd reassoc [[TMP8]], [[TMP7]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 8 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1 +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP9]], i32 [[TMP13]] +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] 
= icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[EXT_B]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret float [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi float [ 0.0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 1 + %ext.a = fpext half %load.a to float + %gep.b = getelementptr half, ptr %b, i64 %iv + %load.b = load half, ptr %gep.b, align 1 + %ext.b = fpext half %load.b to float + %mul = fmul float %ext.b, %ext.a + %add = fadd reassoc float %mul, %ext.b + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret float %add +} + +define void @not_fdotp_not_phi2(ptr %matrix, i32 %n) #0 { +; CHECK-INTERLEAVE1-LABEL: define void @not_fdotp_not_phi2( +; CHECK-INTERLEAVE1-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label %[[FOR_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-INTERLEAVE1: [[FOR_PREHEADER]]: +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load half, ptr null, align 1 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A1:%.*]] = load half, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-INTERLEAVE1-NEXT: [[A_EXT:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVE1-NEXT: [[A_EXT1:%.*]] = fpext half [[LOAD_A1]] to float +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: 
[[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A_EXT]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[A_EXT1]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP37:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 64 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = load half, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = load half, ptr [[TMP9]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = load half, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = load half, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = insertelement <4 x half> poison, half [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = insertelement <4 x half> [[TMP20]], half [[TMP17]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = insertelement <4 x half> [[TMP21]], half [[TMP18]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = insertelement <4 x half> [[TMP22]], half [[TMP19]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = fpext <4 x half> [[TMP23]] to <4 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP24]] +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = fadd reassoc <4 x float> [[TMP25]], [[VEC_PHI]] +; 
CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = load half, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = load half, ptr [[TMP13]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = load half, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = load half, ptr [[TMP15]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = insertelement <4 x half> poison, half [[TMP27]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = insertelement <4 x half> [[TMP31]], half [[TMP28]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = insertelement <4 x half> [[TMP32]], half [[TMP29]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = insertelement <4 x half> [[TMP33]], half [[TMP30]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = fpext <4 x half> [[TMP34]] to <4 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP35]] +; CHECK-INTERLEAVE1-NEXT: [[TMP37]] = fadd reassoc <4 x float> [[TMP36]], [[TMP26]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP37]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVE1: [[SCALAR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[MATRIX]], %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP39]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[FOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi float [ [[ADD_1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[PTR]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B1:%.*]] = getelementptr half, ptr [[PTR]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[B_EXT:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = fmul float [[A_EXT]], [[B_EXT]] +; CHECK-INTERLEAVE1-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B1:%.*]] = load half, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[B_EXT1:%.*]] = fpext half [[LOAD_B1]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL_1:%.*]] = fmul float [[A_EXT1]], [[B_EXT1]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_1]] = fadd reassoc float [[MUL_1]], [[ADD]] +; CHECK-INTERLEAVE1-NEXT: [[SCEVGEP]] = getelementptr half, ptr [[PTR]], i64 16 +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label 
%[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_FLOAT:%.*]] = phi float [ [[ADD_1]], %[[FOR_BODY]] ], [ [[TMP39]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVE1: [[EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_FLOAT]], %[[FOR_EXIT]] ] +; CHECK-INTERLEAVE1-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-INTERLEAVE1-NEXT: ret void +; +; CHECK-INTERLEAVED-LABEL: define void @not_fdotp_not_phi2( +; CHECK-INTERLEAVED-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label %[[FOR_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_PREHEADER]]: +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load half, ptr null, align 1 +; CHECK-INTERLEAVED-NEXT: [[LOAD_A1:%.*]] = load half, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-INTERLEAVED-NEXT: [[A_EXT:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVED-NEXT: [[A_EXT1:%.*]] = fpext half [[LOAD_A1]] to float +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A_EXT]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[A_EXT1]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP70:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP71:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 64 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 128 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 160 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 192 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 224 +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr 
[[MATRIX]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr half, ptr [[NEXT_GEP6]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr half, ptr [[NEXT_GEP7]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr half, ptr [[NEXT_GEP8]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr half, ptr [[NEXT_GEP9]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr half, ptr [[NEXT_GEP10]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr half, ptr [[NEXT_GEP6]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr half, ptr [[NEXT_GEP7]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = getelementptr half, ptr [[NEXT_GEP8]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr half, ptr [[NEXT_GEP9]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr half, ptr [[NEXT_GEP10]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = load half, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = load half, ptr [[TMP13]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = load half, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = load half, ptr [[TMP15]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = insertelement <4 x half> poison, half [[TMP28]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = insertelement <4 x half> [[TMP32]], half [[TMP29]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = insertelement <4 x half> [[TMP33]], half [[TMP30]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = insertelement <4 x half> [[TMP34]], half [[TMP31]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = load half, ptr [[TMP16]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = load half, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = load half, ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = load half, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = insertelement <4 x half> poison, half [[TMP36]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = insertelement <4 x half> [[TMP40]], half [[TMP37]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = insertelement <4 x half> [[TMP41]], half [[TMP38]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = insertelement <4 x half> [[TMP42]], half [[TMP39]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = 
fpext <4 x half> [[TMP35]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = fpext <4 x half> [[TMP43]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP45]] +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = fadd reassoc <4 x float> [[TMP46]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = fadd reassoc <4 x float> [[TMP47]], [[VEC_PHI3]] +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = load half, ptr [[TMP20]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = load half, ptr [[TMP21]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = load half, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = load half, ptr [[TMP23]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = insertelement <4 x half> poison, half [[TMP50]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = insertelement <4 x half> [[TMP54]], half [[TMP51]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = insertelement <4 x half> [[TMP55]], half [[TMP52]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = insertelement <4 x half> [[TMP56]], half [[TMP53]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = load half, ptr [[TMP24]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = load half, ptr [[TMP25]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = load half, ptr [[TMP26]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = load half, ptr [[TMP27]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = insertelement <4 x half> poison, half [[TMP58]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = insertelement <4 x half> [[TMP62]], half [[TMP59]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = insertelement <4 x half> [[TMP63]], half [[TMP60]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = insertelement <4 x half> [[TMP64]], half [[TMP61]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = fpext <4 x half> [[TMP57]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = fpext <4 x half> [[TMP65]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP66]] +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP67]] +; CHECK-INTERLEAVED-NEXT: [[TMP70]] = fadd reassoc <4 x float> [[TMP68]], [[TMP48]] +; CHECK-INTERLEAVED-NEXT: [[TMP71]] = fadd reassoc <4 x float> [[TMP69]], [[TMP49]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP71]], [[TMP70]] +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVED: [[SCALAR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL11:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[MATRIX]], %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP73]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, 
%[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVED: [[FOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL11]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi float [ [[ADD_1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[PTR]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[GEP_B1:%.*]] = getelementptr half, ptr [[PTR]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[B_EXT:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = fmul float [[A_EXT]], [[B_EXT]] +; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B1:%.*]] = load half, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[B_EXT1:%.*]] = fpext half [[LOAD_B1]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL_1:%.*]] = fmul float [[A_EXT1]], [[B_EXT1]] +; CHECK-INTERLEAVED-NEXT: [[ADD_1]] = fadd reassoc float [[MUL_1]], [[ADD]] +; CHECK-INTERLEAVED-NEXT: [[SCEVGEP]] = getelementptr half, ptr [[PTR]], i64 16 +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[ADD_FLOAT:%.*]] = phi float [ [[ADD_1]], %[[FOR_BODY]] ], [ [[TMP73]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVED: [[EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_FLOAT]], %[[FOR_EXIT]] ] +; CHECK-INTERLEAVED-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-INTERLEAVED-NEXT: ret void +; +; CHECK-MAXBW-LABEL: define void @not_fdotp_not_phi2( +; CHECK-MAXBW-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label %[[FOR_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-MAXBW: [[FOR_PREHEADER]]: +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr null, align 1 +; CHECK-MAXBW-NEXT: [[LOAD_A1:%.*]] = load half, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-MAXBW-NEXT: [[A_EXT:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[A_EXT1:%.*]] = fpext half [[LOAD_A1]] to float +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A_EXT]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; 
CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[A_EXT1]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP37:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 64 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = load half, ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = load half, ptr [[TMP9]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = load half, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = load half, ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = insertelement <4 x half> poison, half [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = insertelement <4 x half> [[TMP20]], half [[TMP17]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = insertelement <4 x half> [[TMP21]], half [[TMP18]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = insertelement <4 x half> [[TMP22]], half [[TMP19]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = fpext <4 x half> [[TMP23]] to <4 x float> +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP24]] +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = fadd reassoc <4 x float> [[TMP25]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = load half, ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = load half, ptr [[TMP13]], align 1 +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = load half, ptr [[TMP14]], align 1 +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = load half, ptr [[TMP15]], align 1 +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = insertelement <4 x half> poison, half [[TMP27]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = insertelement <4 x half> [[TMP31]], half [[TMP28]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = insertelement <4 x half> [[TMP32]], half [[TMP29]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = insertelement <4 x half> [[TMP33]], half [[TMP30]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = fpext <4 x half> [[TMP34]] to <4 x float> +; CHECK-MAXBW-NEXT: 
[[TMP36:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP35]] +; CHECK-MAXBW-NEXT: [[TMP37]] = fadd reassoc <4 x float> [[TMP36]], [[TMP26]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP37]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[MATRIX]], %[[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP39]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[ADD_1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[PTR]], i64 1 +; CHECK-MAXBW-NEXT: [[GEP_B1:%.*]] = getelementptr half, ptr [[PTR]], i64 2 +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[B_EXT:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[A_EXT]], [[B_EXT]] +; CHECK-MAXBW-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[LOAD_B1:%.*]] = load half, ptr [[GEP_B1]], align 1 +; CHECK-MAXBW-NEXT: [[B_EXT1:%.*]] = fpext half [[LOAD_B1]] to float +; CHECK-MAXBW-NEXT: [[MUL_1:%.*]] = fmul float [[A_EXT1]], [[B_EXT1]] +; CHECK-MAXBW-NEXT: [[ADD_1]] = fadd reassoc float [[MUL_1]], [[ADD]] +; CHECK-MAXBW-NEXT: [[SCEVGEP]] = getelementptr half, ptr [[PTR]], i64 16 +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_FLOAT:%.*]] = phi float [ [[ADD_1]], %[[FOR_BODY]] ], [ [[TMP39]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: br label %[[EXIT]] +; CHECK-MAXBW: [[EXIT]]: +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_FLOAT]], %[[FOR_EXIT]] ] +; CHECK-MAXBW-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-MAXBW-NEXT: ret void +; +entry: + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %for.preheader, label %exit + +for.preheader: ; preds = %entry + %load.a = load half, ptr inttoptr (i64 0 to ptr), align 1 + %load.a1 = load half, ptr inttoptr (i64 1 to ptr), align 1 + %a.ext = fpext half %load.a to float + %a.ext1 = fpext half %load.a1 to float + br label %for.body + +for.body: ; preds = %for.preheader, %for.body + %iv = phi i32 [ %iv.next, %for.body ], [ 0, %for.preheader ] + %ptr = phi ptr [ %scevgep, %for.body ], [ %matrix, %for.preheader ] + %accum = phi float [ 
%add.1, %for.body ], [ 0.0, %for.preheader ] + %gep.b = getelementptr half, ptr %ptr, i64 1 + %gep.b1 = getelementptr half, ptr %ptr, i64 2 + %load.b = load half, ptr %gep.b, align 1 + %b.ext = fpext half %load.b to float + %mul = fmul float %a.ext, %b.ext + %add = fadd reassoc float %mul, %accum + %load.b1 = load half, ptr %gep.b1, align 1 + %b.ext1 = fpext half %load.b1 to float + %mul.1 = fmul float %a.ext1, %b.ext1 + %add.1 = fadd reassoc float %mul.1, %add + %scevgep = getelementptr half, ptr %ptr, i64 16 + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv.next, %n + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + %add.float = phi float [ %add.1, %for.body ] + br label %exit + +exit: ; preds = %for.exit, %entry + %result = phi float [ 0.000000e+00, %entry ], [ %add.float, %for.exit ] + store float %result, ptr %matrix, align 4 + ret void +} + +define float @not_fdotp_ext_outside_plan(ptr %a, half %b, i64 %n) #0 { +; CHECK-INTERLEAVE1-LABEL: define float @not_fdotp_ext_outside_plan( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], half [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[FOR_PH:.*]] +; CHECK-INTERLEAVE1: [[FOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = fpext half [[B]] to float +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[EXT_B]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[TMP1]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3]] = fadd reassoc <8 x float> [[TMP2]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVE1: [[SCALAR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
%[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[FOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = fmul float [[EXT_A]], [[EXT_B]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVE1: [[EXIT_LOOPEXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVE1: [[EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-INTERLEAVE1-NEXT: ret float [[RESULT]] +; +; CHECK-INTERLEAVED-LABEL: define float @not_fdotp_ext_outside_plan( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], half [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[FOR_PH:.*]] +; CHECK-INTERLEAVED: [[FOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = fpext half [[B]] to float +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[EXT_B]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds half, ptr [[TMP0]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; 
CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = fmul <8 x float> [[TMP2]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[TMP3]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP6]] = fadd reassoc <8 x float> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = fadd reassoc <8 x float> [[TMP5]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <8 x float> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVED: [[SCALAR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PH]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVED: [[FOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = fmul float [[EXT_A]], [[EXT_B]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVED: [[EXIT_LOOPEXIT]]: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVED: [[EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-INTERLEAVED-NEXT: ret float [[RESULT]] +; +; CHECK-MAXBW-LABEL: define float @not_fdotp_ext_outside_plan( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], half [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[FOR_PH:.*]] +; CHECK-MAXBW: [[FOR_PH]]: +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext half [[B]] to float +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; 
CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (float -0.000000e+00), float 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 2 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = fmul [[TMP5]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP7]] = fadd reassoc [[TMP6]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[TMP7]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PH]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PH]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[EXT_A]], [[EXT_B]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW: [[EXIT_LOOPEXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: br label %[[EXIT]] +; CHECK-MAXBW: [[EXIT]]: +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-MAXBW-NEXT: ret float [[RESULT]] +; +entry: + %cmp = icmp eq i64 %n, 0 + br i1 %cmp, label %exit, label %for.ph + +for.ph: ; preds = %entry + %ext.b = fpext half %b to float + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %iv = phi i64 [ 0, %for.ph ], [ %iv.next, 
%for.body ] + %accum = phi float [ 0.0, %for.ph ], [ %add, %for.body ] + %gep.a = getelementptr inbounds half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 2 + %ext.a = fpext half %load.a to float + %mul = fmul float %ext.a, %ext.b + %add = fadd reassoc float %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %cmp.1 = icmp eq i64 %iv.next, %n + br i1 %cmp.1, label %exit, label %for.body + +exit: ; preds = %for.cond.cleanup.loopexit, %entry + %result = phi float [ 0.0, %entry ], [ %add, %for.body ] + ret float %result +} + + +;. +; CHECK-INTERLEAVE1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVE1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVE1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-INTERLEAVE1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +;. +; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-INTERLEAVED: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +;. +; CHECK-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +;.
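
The functions in this test are negative cases: mismatched extension types (half and float widened to double), an accumulator that is not the loop phi, and an fpext defined outside the loop all have to keep the vectorizer on the ordinary llvm.vector.reduce.fadd path that the CHECK lines above verify. For contrast, below is a minimal sketch of the in-loop pattern a positive case is expected to reduce to, using the llvm.vector.partial.reduce.fadd intrinsic introduced by this patch; the function name, vector widths, and overload mangling here are illustrative only and may not match what the vectorizer actually emits.

  define <4 x float> @fdotp_sketch(<4 x float> %acc, <8 x half> %a, <8 x half> %b) {
    ; Both operands share the same widening (half -> float), so the widened
    ; multiply can feed a partial reduction of the accumulator instead of a
    ; full-width reassociating fadd into the reduction phi.
    %ext.a = fpext <8 x half> %a to <8 x float>
    %ext.b = fpext <8 x half> %b to <8 x float>
    %mul = fmul reassoc <8 x float> %ext.b, %ext.a
    %red = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> %acc, <8 x float> %mul)
    ret <4 x float> %red
  }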