From ab6662ca4a696211664b2b7511bd93b7c0ae48b7 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Fri, 19 Sep 2025 10:51:26 +0000 Subject: [PATCH 01/15] Add `llvm.vector.partial.reduction.fadd` intrinsic With this intrinsic, and supporting SelectionDAG nodes, we can better make use of instructions such as AArch64's `FDOT`. --- llvm/docs/LangRef.rst | 42 ++++++++++++ llvm/include/llvm/CodeGen/ISDOpcodes.h | 3 +- llvm/include/llvm/CodeGen/TargetLowering.h | 4 +- llvm/include/llvm/IR/Intrinsics.td | 4 ++ .../include/llvm/Target/TargetSelectionDAG.td | 2 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 24 +++++-- .../SelectionDAG/LegalizeVectorOps.cpp | 2 + .../SelectionDAG/LegalizeVectorTypes.cpp | 2 + .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 3 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 13 ++++ .../SelectionDAG/SelectionDAGDumper.cpp | 2 + .../CodeGen/SelectionDAG/TargetLowering.cpp | 21 +++--- .../Target/AArch64/AArch64ISelLowering.cpp | 6 ++ .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 3 + llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 66 +++++++++++++++++++ 15 files changed, 178 insertions(+), 19 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-fdot.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 033910121a54f..c323872d89724 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20780,6 +20780,48 @@ performance, and an out-of-loop phase to calculate the final scalar result. By avoiding the introduction of new ordering constraints, these intrinsics enhance the ability to leverage a target's accumulation instructions. +'``llvm.vector.partial.reduce.fadd.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <4 x f32> @llvm.vector.partial.reduce.fadd.v4f32.v4f32.v8f32(<4 x f32> %a, <8 x f32> %b) + declare @llvm.vector.partial.reduce.add.nxv4f32.nxv4f32.nxv8f32( %a, %b) + +Overview: +""""""""" + +The '``llvm.vector.partial.reduce.fadd.*``' intrinsics reduce the +concatenation of the two vector arguments down to the number of elements of the +result vector type. + +Arguments: +"""""""""" + +The first argument is a floating-point vector with the same type as the result. + +The second argument is a vector with a length that is a known integer multiple +of the result's type, while maintaining the same element type. + +Semantics: +"""""""""" + +Other than the reduction operator (e.g. add) the way in which the concatenated +arguments is reduced is entirely unspecified. By their nature these intrinsics +are not expected to be useful in isolation but instead implement the first phase +of an overall reduction operation. + +The typical use case is loop vectorization where reductions are split into an +in-loop phase, where maintaining an unordered vector result is important for +performance, and an out-of-loop phase to calculate the final scalar result. + +By avoiding the introduction of new ordering constraints, these intrinsics +enhance the ability to leverage a target's accumulation instructions. 
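For illustration, the widening dot-product pattern these intrinsics are intended to
support (mirroring the ``fdot_wide_vl128`` test added later in this patch) might look
like the following sketch::

    ; Sketch only: accumulate half-precision products into a single-precision accumulator.
    %a.wide = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
    %b.wide = fpext <vscale x 8 x half> %b to <vscale x 8 x float>
    %mult = fmul <vscale x 8 x float> %a.wide, %b.wide
    %red = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %mult)

On SVE2p1 this whole sequence can be selected to a single ``fdot`` instruction.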
+ '``llvm.experimental.vector.histogram.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index ff3dd0d4c3c51..d5bb8e780d121 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1516,6 +1516,7 @@ enum NodeType { PARTIAL_REDUCE_SMLA, // sext, sext PARTIAL_REDUCE_UMLA, // zext, zext PARTIAL_REDUCE_SUMLA, // sext, zext + PARTIAL_REDUCE_FMLA, // fpext, fpext // The `llvm.experimental.stackmap` intrinsic. // Operands: input chain, glue, , , [live0[, live1...]] @@ -1767,7 +1768,7 @@ LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type); inline bool isExtOpcode(unsigned Opcode) { return Opcode == ISD::ANY_EXTEND || Opcode == ISD::ZERO_EXTEND || - Opcode == ISD::SIGN_EXTEND; + Opcode == ISD::SIGN_EXTEND || Opcode == ISD::FP_EXTEND; } inline bool isExtVecInRegOpcode(unsigned Opcode) { diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 73f2c55a71125..92405bebbca46 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1664,7 +1664,7 @@ class LLVM_ABI TargetLoweringBase { LegalizeAction getPartialReduceMLAAction(unsigned Opc, EVT AccVT, EVT InputVT) const { assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA || - Opc == ISD::PARTIAL_REDUCE_SUMLA); + Opc == ISD::PARTIAL_REDUCE_SUMLA || Opc == ISD::PARTIAL_REDUCE_FMLA); PartialReduceActionTypes Key = {Opc, AccVT.getSimpleVT().SimpleTy, InputVT.getSimpleVT().SimpleTy}; auto It = PartialReduceMLAActions.find(Key); @@ -2766,7 +2766,7 @@ class LLVM_ABI TargetLoweringBase { void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action) { assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA || - Opc == ISD::PARTIAL_REDUCE_SUMLA); + Opc == ISD::PARTIAL_REDUCE_SUMLA || Opc == ISD::PARTIAL_REDUCE_FMLA); assert(AccVT.isValid() && InputVT.isValid() && "setPartialReduceMLAAction types aren't valid"); PartialReduceActionTypes Key = {Opc, AccVT.SimpleTy, InputVT.SimpleTy}; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 12d1c2528f977..0cb01fb484e88 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2799,6 +2799,10 @@ def int_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; +def int_vector_partial_reduce_fadd : DefaultAttrsIntrinsic<[LLVMMatchType<0>], + [llvm_anyfloat_ty, llvm_anyfloat_ty], + [IntrNoMem]>; + //===----------------- Pointer Authentication Intrinsics ------------------===// // diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 07a858fd682fc..a9750a5ab03f9 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -527,6 +527,8 @@ def partial_reduce_smla : SDNode<"ISD::PARTIAL_REDUCE_SMLA", SDTPartialReduceMLA>; def partial_reduce_sumla : SDNode<"ISD::PARTIAL_REDUCE_SUMLA", SDTPartialReduceMLA>; +def partial_reduce_fmla : SDNode<"ISD::PARTIAL_REDUCE_FMLA", + SDTPartialReduceMLA>; def fadd : SDNode<"ISD::FADD" , SDTFPBinOp, [SDNPCommutative]>; def fsub : SDNode<"ISD::FSUB" , SDTFPBinOp>; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 310d35d9b1d1e..d88fb7dcda3ad 100644 --- 
a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2042,6 +2042,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: return visitPARTIAL_REDUCE_MLA(N); case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N); case ISD::LIFETIME_END: return visitLIFETIME_END(N); @@ -12996,7 +12997,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDValue Op2 = N->getOperand(2); unsigned Opc = Op1->getOpcode(); - if (Opc != ISD::MUL && Opc != ISD::SHL) + if (Opc != ISD::MUL && Opc != ISD::FMUL && Opc != ISD::SHL) return SDValue(); SDValue LHS = Op1->getOperand(0); @@ -13016,8 +13017,11 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { } APInt C; - if (Opc != ISD::MUL || !ISD::isConstantSplatVector(Op2.getNode(), C) || - !C.isOne()) + if (!(Op1->getOpcode() == ISD::MUL && + ISD::isConstantSplatVector(Op2.getNode(), C) && C.isOne()) && + !(Op1->getOpcode() == ISD::FMUL && + ISD::isConstantSplatVector(Op2.getNode(), C) && + C == APFloat(1.0f).bitcastToAPInt().trunc(C.getBitWidth()))) return SDValue(); unsigned LHSOpcode = LHS->getOpcode(); @@ -13070,6 +13074,8 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { else if (LHSOpcode == ISD::ZERO_EXTEND && RHSOpcode == ISD::SIGN_EXTEND) { NewOpc = ISD::PARTIAL_REDUCE_SUMLA; std::swap(LHSExtOp, RHSExtOp); + } else if (LHSOpcode == ISD::FP_EXTEND && RHSOpcode == ISD::FP_EXTEND) { + NewOpc = ISD::PARTIAL_REDUCE_FMLA; } else return SDValue(); // For a 2-stage extend the signedness of both of the extends must match @@ -13105,22 +13111,26 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { APInt ConstantOne; if (!ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) || - !ConstantOne.isOne()) + !(ConstantOne.isOne() || + ConstantOne == + APFloat(1.0f).bitcastToAPInt().trunc(ConstantOne.getBitWidth()))) return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); if (!ISD::isExtOpcode(Op1Opcode)) return SDValue(); - bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND; + bool Op1IsSigned = Op1Opcode != ISD::ZERO_EXTEND; bool NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA; EVT AccElemVT = Acc.getValueType().getVectorElementType(); if (Op1IsSigned != NodeIsSigned && Op1.getValueType().getVectorElementType() != AccElemVT) return SDValue(); - unsigned NewOpcode = - Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA; + unsigned NewOpcode = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? ISD::PARTIAL_REDUCE_FMLA + : Op1IsSigned ? 
ISD::PARTIAL_REDUCE_SMLA + : ISD::PARTIAL_REDUCE_UMLA; SDValue UnextOp1 = Op1.getOperand(0); EVT UnextOp1VT = UnextOp1.getValueType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 8e423c4f83b38..94751be5b7986 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -534,6 +534,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Action = TLI.getPartialReduceMLAAction(Op.getOpcode(), Node->getValueType(0), Node->getOperand(1).getValueType()); @@ -1243,6 +1244,7 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Results.push_back(TLI.expandPartialReduceMLA(Node, DAG)); return; case ISD::VECREDUCE_SEQ_FADD: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 3b5f83f7c089a..4b40b32621418 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1459,6 +1459,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi); break; case ISD::GET_ACTIVE_LANE_MASK: @@ -3674,6 +3675,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Res = SplitVecOp_PARTIAL_REDUCE_MLA(N); break; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 90edaf3ef5471..0769fa1072888 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8404,7 +8404,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: - case ISD::PARTIAL_REDUCE_SUMLA: { + case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: { [[maybe_unused]] EVT AccVT = N1.getValueType(); [[maybe_unused]] EVT Input1VT = N2.getValueType(); [[maybe_unused]] EVT Input2VT = N3.getValueType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 20a0efd3afa1c..eb5fe4f5328c6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8081,6 +8081,19 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Input, DAG.getConstant(1, sdl, Input.getValueType()))); return; } + case Intrinsic::vector_partial_reduce_fadd: { + if (!TLI.shouldExpandPartialReductionIntrinsic(cast(&I))) { + visitTargetIntrinsic(I, Intrinsic); + return; + } + SDValue Acc = getValue(I.getOperand(0)); + SDValue Input = getValue(I.getOperand(1)); + setValue(&I, + DAG.getNode(ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc, + Input, + DAG.getConstantFP(1.0f, sdl, Input.getValueType()))); + return; + } case Intrinsic::experimental_cttz_elts: { auto DL = getCurSDLoc(); SDValue Op = getValue(I.getOperand(0)); diff --git 
a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 39cbfad6d0be1..8495e9691c77b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -588,6 +588,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { return "partial_reduce_smla"; case ISD::PARTIAL_REDUCE_SUMLA: return "partial_reduce_sumla"; + case ISD::PARTIAL_REDUCE_FMLA: + return "partial_reduce_fmla"; case ISD::LOOP_DEPENDENCE_WAR_MASK: return "loop_dep_war"; case ISD::LOOP_DEPENDENCE_RAW_MASK: diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 920dff935daed..e27ae1619f570 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12061,12 +12061,14 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(), MulOpVT.getVectorElementCount()); - unsigned ExtOpcLHS = N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA - ? ISD::ZERO_EXTEND - : ISD::SIGN_EXTEND; - unsigned ExtOpcRHS = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA - ? ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND; + unsigned ExtOpcLHS = + N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FP_EXTEND + : N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA ? ISD::ZERO_EXTEND + : ISD::SIGN_EXTEND; + unsigned ExtOpcRHS = + N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FP_EXTEND + : N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA ? ISD::SIGN_EXTEND + : ISD::ZERO_EXTEND; if (ExtMulOpVT != MulOpVT) { MulLHS = DAG.getNode(ExtOpcLHS, DL, ExtMulOpVT, MulLHS); @@ -12075,7 +12077,7 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, SDValue Input = MulLHS; APInt ConstantOne; if (!ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) || - !ConstantOne.isOne()) + !(ConstantOne.isOne() || ConstantOne == APFloat(1.0f).bitcastToAPInt())) Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); unsigned Stride = AccVT.getVectorMinNumElements(); @@ -12086,10 +12088,13 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, for (unsigned I = 0; I < ScaleFactor; I++) Subvectors.push_back(DAG.getExtractSubvector(DL, AccVT, Input, I * Stride)); + unsigned FlatNode = + N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? 
ISD::FADD : ISD::ADD; + // Flatten the subvector tree while (Subvectors.size() > 1) { Subvectors.push_back( - DAG.getNode(ISD::ADD, DL, AccVT, {Subvectors[0], Subvectors[1]})); + DAG.getNode(FlatNode, DL, AccVT, {Subvectors[0], Subvectors[1]})); Subvectors.pop_front(); Subvectors.pop_front(); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a81de5c5adc34..e719863c3ad99 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1923,6 +1923,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } + // Handle floating-point partial reduction + if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { + static const unsigned FMLAOps[] = {ISD::PARTIAL_REDUCE_FMLA}; + setPartialReduceMLAAction(FMLAOps, MVT::nxv4f32, MVT::nxv8f16, Legal); + } + // Handle non-aliasing elements mask if (Subtarget->hasSVE2() || (Subtarget->hasSME() && Subtarget->isStreaming())) { diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 98a128e582866..8522bd237a242 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4254,6 +4254,9 @@ defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>; defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; +def : Pat<(nxv4f32 (partial_reduce_fmla nxv4f32:$Acc, nxv8f16:$LHS, nxv8f16:$RHS)), + (FDOT_ZZZ_S $Acc, $LHS, $RHS)>; + defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>; defm BFMLSLT_ZZZ_S : sve2_fp_mla_long<0b111, "bfmlslt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslt>; defm BFMLSLB_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb_lane>; diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll new file mode 100644 index 0000000000000..5bb1fae43392f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s + +define @fdot_wide_vl128( %acc, %a, %b) { +; CHECK-LABEL: fdot_wide_vl128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fdot z0.s, z1.h, z2.h +; CHECK-NEXT: ret +entry: + %a.wide = fpext %a to + %b.wide = fpext %b to + %mult = fmul %a.wide, %b.wide + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %mult) + ret %partial.reduce +} + +define void @fdot_wide_vl256(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,2) { +; CHECK-LABEL: fdot_wide_vl256: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x2] +; CHECK-NEXT: ld1h { z2.s }, p0/z, [x1, #1, mul vl] +; CHECK-NEXT: ld1h { z3.s }, p0/z, [x2, #1, mul vl] +; CHECK-NEXT: fcvt z0.s, p0/m, z0.h +; CHECK-NEXT: fcvt z1.s, p0/m, z1.h +; CHECK-NEXT: fcvt z2.s, p0/m, z2.h +; CHECK-NEXT: fcvt z3.s, p0/m, z3.h +; CHECK-NEXT: fmul z0.s, z0.s, z1.s +; CHECK-NEXT: ldr z1, [x0] +; CHECK-NEXT: fmul z2.s, z2.s, z3.s +; CHECK-NEXT: fadd z0.s, z1.s, z0.s +; CHECK-NEXT: fadd z0.s, z0.s, z2.s +; CHECK-NEXT: str z0, [x0] +; CHECK-NEXT: ret +entry: + %acc = load <8 x float>, ptr %accptr + %a 
= load <16 x half>, ptr %aptr + %b = load <16 x half>, ptr %bptr + %a.wide = fpext <16 x half> %a to <16 x float> + %b.wide = fpext <16 x half> %b to <16 x float> + %mult = fmul <16 x float> %a.wide, %b.wide + %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) + store <8 x float> %partial.reduce, ptr %accptr + ret void +} + +define <4 x float> @fixed_fdot_wide(<4 x float> %acc, <8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: fixed_fdot_wide: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtl v3.4s, v1.4h +; CHECK-NEXT: fcvtl v4.4s, v2.4h +; CHECK-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-NEXT: fcvtl2 v2.4s, v2.8h +; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %a.wide = fpext <8 x half> %a to <8 x float> + %b.wide = fpext <8 x half> %b to <8 x float> + %mult = fmul <8 x float> %a.wide, %b.wide + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) + ret <4 x float> %partial.reduce +} From e92bba76ef82203c352a5d6e8f653a53017b270e Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Fri, 19 Sep 2025 16:00:49 +0000 Subject: [PATCH 02/15] Revert adding `FP_EXTEND` to `isExtOpcode` --- llvm/include/llvm/CodeGen/ISDOpcodes.h | 2 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index d5bb8e780d121..1a3fd27e64c4f 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1768,7 +1768,7 @@ LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type); inline bool isExtOpcode(unsigned Opcode) { return Opcode == ISD::ANY_EXTEND || Opcode == ISD::ZERO_EXTEND || - Opcode == ISD::SIGN_EXTEND || Opcode == ISD::FP_EXTEND; + Opcode == ISD::SIGN_EXTEND; } inline bool isExtVecInRegOpcode(unsigned Opcode) { diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d88fb7dcda3ad..d364768e7628c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13025,7 +13025,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { return SDValue(); unsigned LHSOpcode = LHS->getOpcode(); - if (!ISD::isExtOpcode(LHSOpcode)) + if (!ISD::isExtOpcode(LHSOpcode) && LHSOpcode != ISD::FP_EXTEND) return SDValue(); SDValue LHSExtOp = LHS->getOperand(0); @@ -13057,7 +13057,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { } unsigned RHSOpcode = RHS->getOpcode(); - if (!ISD::isExtOpcode(RHSOpcode)) + if (!ISD::isExtOpcode(RHSOpcode) && RHSOpcode != ISD::FP_EXTEND) return SDValue(); SDValue RHSExtOp = RHS->getOperand(0); From 92ee5a2256338070bc8a934bffff2136007c4a5f Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Mon, 22 Sep 2025 10:15:46 +0000 Subject: [PATCH 03/15] Address review comments Corrected LangRef typos, improved const comparisons for fadd, and add direct tests. 
--- llvm/docs/LangRef.rst | 6 +- llvm/include/llvm/IR/Intrinsics.td | 4 +- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 20 +++--- .../CodeGen/SelectionDAG/TargetLowering.cpp | 8 ++- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 3 - llvm/lib/Target/AArch64/SVEInstrFormats.td | 1 + llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 66 +++++++++++++++++++ 7 files changed, 90 insertions(+), 18 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index c323872d89724..7254e8faadeb5 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20789,8 +20789,8 @@ This is an overloaded intrinsic. :: - declare <4 x f32> @llvm.vector.partial.reduce.fadd.v4f32.v4f32.v8f32(<4 x f32> %a, <8 x f32> %b) - declare @llvm.vector.partial.reduce.add.nxv4f32.nxv4f32.nxv8f32( %a, %b) + declare <4 x f32> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x f32> %a, <8 x f32> %b) + declare @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32( %a, %b) Overview: """"""""" @@ -20810,7 +20810,7 @@ of the result's type, while maintaining the same element type. Semantics: """""""""" -Other than the reduction operator (e.g. add) the way in which the concatenated +Other than the reduction operator (e.g. fadd) the way in which the concatenated arguments is reduced is entirely unspecified. By their nature these intrinsics are not expected to be useful in isolation but instead implement the first phase of an overall reduction operation. diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 0cb01fb484e88..5163d2052561e 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2800,8 +2800,8 @@ def int_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], IntrSpeculatable]>; def int_vector_partial_reduce_fadd : DefaultAttrsIntrinsic<[LLVMMatchType<0>], - [llvm_anyfloat_ty, llvm_anyfloat_ty], - [IntrNoMem]>; + [llvm_anyfloat_ty, llvm_anyfloat_ty], + [IntrNoMem]>; //===----------------- Pointer Authentication Intrinsics ------------------===// // diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d364768e7628c..dda05ddb4e53e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13017,11 +13017,12 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { } APInt C; + ConstantFPSDNode *CFP; if (!(Op1->getOpcode() == ISD::MUL && ISD::isConstantSplatVector(Op2.getNode(), C) && C.isOne()) && !(Op1->getOpcode() == ISD::FMUL && - ISD::isConstantSplatVector(Op2.getNode(), C) && - C == APFloat(1.0f).bitcastToAPInt().trunc(C.getBitWidth()))) + (CFP = llvm::isConstOrConstSplatFP(Op2, false)) && + CFP->isExactlyValue(1.0))) return SDValue(); unsigned LHSOpcode = LHS->getOpcode(); @@ -13110,20 +13111,23 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDValue Op2 = N->getOperand(2); APInt ConstantOne; - if (!ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) || - !(ConstantOne.isOne() || - ConstantOne == - APFloat(1.0f).bitcastToAPInt().trunc(ConstantOne.getBitWidth()))) + ConstantFPSDNode *C; + if (!(N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA && + (C = llvm::isConstOrConstSplatFP(Op2, false)) && + C->isExactlyValue(1.0)) && + !(ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) && + ConstantOne.isOne())) return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); if (!ISD::isExtOpcode(Op1Opcode)) return SDValue(); - bool Op1IsSigned = Op1Opcode != ISD::ZERO_EXTEND; + bool Op1IsSigned = Op1Opcode == 
ISD::SIGN_EXTEND; bool NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA; EVT AccElemVT = Acc.getValueType().getVectorElementType(); - if (Op1IsSigned != NodeIsSigned && + if (N->getOpcode() != ISD::PARTIAL_REDUCE_FMLA && + Op1IsSigned != NodeIsSigned && Op1.getValueType().getVectorElementType() != AccElemVT) return SDValue(); diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index e27ae1619f570..3e5e41555c43b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12076,8 +12076,12 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, } SDValue Input = MulLHS; APInt ConstantOne; - if (!ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) || - !(ConstantOne.isOne() || ConstantOne == APFloat(1.0f).bitcastToAPInt())) + ConstantFPSDNode *C; + if (!(N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA && + (C = llvm::isConstOrConstSplatFP(MulRHS, false)) && + C->isExactlyValue(1.0)) && + !(ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) && + ConstantOne.isOne())) Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); unsigned Stride = AccVT.getVectorMinNumElements(); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 8522bd237a242..98a128e582866 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4254,9 +4254,6 @@ defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>; defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; -def : Pat<(nxv4f32 (partial_reduce_fmla nxv4f32:$Acc, nxv8f16:$LHS, nxv8f16:$RHS)), - (FDOT_ZZZ_S $Acc, $LHS, $RHS)>; - defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>; defm BFMLSLT_ZZZ_S : sve2_fp_mla_long<0b111, "bfmlslt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslt>; defm BFMLSLB_ZZZI_S : sve2_fp_mla_long_by_indexed_elem<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb_lane>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 3cdd505f12116..e60a8632d2e8b 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -9457,6 +9457,7 @@ multiclass sve_float_dot { def NAME : sve_float_dot; def : SVE_3_Op_Pat(NAME)>; + def : SVE_3_Op_Pat(NAME)>; } multiclass sve_fp8_dot @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) ret <4 x float> %partial.reduce } + +define <8 x half> @partial_reduce_half(<8 x half> %acc, <16 x half> %a) { +; CHECK-LABEL: partial_reduce_half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ret +entry: + %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) + ret <8 x half> %partial.reduce +} + +define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) { +; CHECK-LABEL: partial_reduce_float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ret +entry: + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) + ret <4 x float> %partial.reduce +} + +define <2 x double> 
@partial_reduce_double(<2 x double> %acc, <4 x double> %a) { +; CHECK-LABEL: partial_reduce_double: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d +; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d +; CHECK-NEXT: ret +entry: + %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) + ret <2 x double> %partial.reduce +} + +define @partial_reduce_half_vl128( %acc, %a) { +; CHECK-LABEL: partial_reduce_half_vl128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd z0.h, z0.h, z1.h +; CHECK-NEXT: fadd z0.h, z0.h, z2.h +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + ret %partial.reduce +} + +define @partial_reduce_float_vl128( %acc, %a) { +; CHECK-LABEL: partial_reduce_float_vl128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd z0.s, z0.s, z1.s +; CHECK-NEXT: fadd z0.s, z0.s, z2.s +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + ret %partial.reduce +} + +define @partial_reduce_double_vl128( %acc, %a) { +; CHECK-LABEL: partial_reduce_double_vl128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd z0.d, z0.d, z1.d +; CHECK-NEXT: fadd z0.d, z0.d, z2.d +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + ret %partial.reduce +} From f4542a90b73a7ada67c08fb6fa71eb8f5ce65292 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Tue, 23 Sep 2025 09:13:39 +0000 Subject: [PATCH 04/15] Require reassoc --- llvm/lib/IR/Verifier.cpp | 6 ++++++ llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 18 +++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 03da1547b652f..65975868388aa 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6578,6 +6578,12 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } break; } + case Intrinsic::vector_partial_reduce_fadd: { + Check(Call.hasAllowReassoc(), + "vector_partial_reduce_fadd requires reassociation to be allowed."); + // Fall through to perform the same verification checks as for integers. 
+ [[fallthrough]]; + } case Intrinsic::vector_partial_reduce_add: { VectorType *AccTy = cast(Call.getArgOperand(0)->getType()); VectorType *VecTy = cast(Call.getArgOperand(1)->getType()); diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll index 69c0b68f23f78..aa2184ab6e65e 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -10,7 +10,7 @@ entry: %a.wide = fpext %a to %b.wide = fpext %b to %mult = fmul %a.wide, %b.wide - %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %mult) + %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %mult) ret %partial.reduce } @@ -40,7 +40,7 @@ entry: %a.wide = fpext <16 x half> %a to <16 x float> %b.wide = fpext <16 x half> %b to <16 x float> %mult = fmul <16 x float> %a.wide, %b.wide - %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) + %partial.reduce = call reassoc <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) store <8 x float> %partial.reduce, ptr %accptr ret void } @@ -61,7 +61,7 @@ entry: %a.wide = fpext <8 x half> %a to <8 x float> %b.wide = fpext <8 x half> %b to <8 x float> %mult = fmul <8 x float> %a.wide, %b.wide - %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) + %partial.reduce = call reassoc <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) ret <4 x float> %partial.reduce } @@ -72,7 +72,7 @@ define <8 x half> @partial_reduce_half(<8 x half> %acc, <16 x half> %a) { ; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret entry: - %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) + %partial.reduce = call reassoc <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) ret <8 x half> %partial.reduce } @@ -83,7 +83,7 @@ define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) { ; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret entry: - %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) + %partial.reduce = call reassoc <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) ret <4 x float> %partial.reduce } @@ -94,7 +94,7 @@ define <2 x double> @partial_reduce_double(<2 x double> %acc, <4 x double> %a) { ; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d ; CHECK-NEXT: ret entry: - %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) + %partial.reduce = call reassoc <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) ret <2 x double> %partial.reduce } @@ -105,7 +105,7 @@ define @partial_reduce_half_vl128( %acc, ; CHECK-NEXT: fadd z0.h, z0.h, z2.h ; CHECK-NEXT: ret entry: - %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %a) ret %partial.reduce } @@ -116,7 +116,7 @@ define @partial_reduce_float_vl128( %ac ; CHECK-NEXT: fadd z0.s, z0.s, z2.s ; CHECK-NEXT: ret entry: - %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %a) ret %partial.reduce } @@ -127,6 +127,6 @@ define @partial_reduce_double_vl128( ; CHECK-NEXT: fadd z0.d, z0.d, z2.d ; CHECK-NEXT: ret entry: - %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + 
%partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %a) ret %partial.reduce } From 6295b47d8246a1995e6573612a84844ec561a797 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Fri, 26 Sep 2025 09:55:22 +0000 Subject: [PATCH 05/15] Revert "Require reassoc" This reverts commit 319852132602f685aea6228f10418370fd530aa7. --- llvm/lib/IR/Verifier.cpp | 6 ------ llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 18 +++++++++--------- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 65975868388aa..03da1547b652f 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6578,12 +6578,6 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } break; } - case Intrinsic::vector_partial_reduce_fadd: { - Check(Call.hasAllowReassoc(), - "vector_partial_reduce_fadd requires reassociation to be allowed."); - // Fall through to perform the same verification checks as for integers. - [[fallthrough]]; - } case Intrinsic::vector_partial_reduce_add: { VectorType *AccTy = cast(Call.getArgOperand(0)->getType()); VectorType *VecTy = cast(Call.getArgOperand(1)->getType()); diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll index aa2184ab6e65e..69c0b68f23f78 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -10,7 +10,7 @@ entry: %a.wide = fpext %a to %b.wide = fpext %b to %mult = fmul %a.wide, %b.wide - %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %mult) + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %mult) ret %partial.reduce } @@ -40,7 +40,7 @@ entry: %a.wide = fpext <16 x half> %a to <16 x float> %b.wide = fpext <16 x half> %b to <16 x float> %mult = fmul <16 x float> %a.wide, %b.wide - %partial.reduce = call reassoc <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) + %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) store <8 x float> %partial.reduce, ptr %accptr ret void } @@ -61,7 +61,7 @@ entry: %a.wide = fpext <8 x half> %a to <8 x float> %b.wide = fpext <8 x half> %b to <8 x float> %mult = fmul <8 x float> %a.wide, %b.wide - %partial.reduce = call reassoc <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) ret <4 x float> %partial.reduce } @@ -72,7 +72,7 @@ define <8 x half> @partial_reduce_half(<8 x half> %acc, <16 x half> %a) { ; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h ; CHECK-NEXT: ret entry: - %partial.reduce = call reassoc <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) + %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) ret <8 x half> %partial.reduce } @@ -83,7 +83,7 @@ define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) { ; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s ; CHECK-NEXT: ret entry: - %partial.reduce = call reassoc <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) ret <4 x float> %partial.reduce } @@ -94,7 +94,7 @@ define <2 x double> @partial_reduce_double(<2 x double> %acc, <4 x double> %a) { ; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d ; CHECK-NEXT: ret entry: - %partial.reduce = call reassoc <2 x 
double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) + %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) ret <2 x double> %partial.reduce } @@ -105,7 +105,7 @@ define @partial_reduce_half_vl128( %acc, ; CHECK-NEXT: fadd z0.h, z0.h, z2.h ; CHECK-NEXT: ret entry: - %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %a) + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) ret %partial.reduce } @@ -116,7 +116,7 @@ define @partial_reduce_float_vl128( %ac ; CHECK-NEXT: fadd z0.s, z0.s, z2.s ; CHECK-NEXT: ret entry: - %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %a) + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) ret %partial.reduce } @@ -127,6 +127,6 @@ define @partial_reduce_double_vl128( ; CHECK-NEXT: fadd z0.d, z0.d, z2.d ; CHECK-NEXT: ret entry: - %partial.reduce = call reassoc @llvm.vector.partial.reduce.fadd( %acc, %a) + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) ret %partial.reduce } From cd36eae030c032232ce563d02e3d22f08e823576 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Wed, 8 Oct 2025 10:38:25 +0000 Subject: [PATCH 06/15] Re-add Verifier case, and consolidate partial reduction docs --- llvm/docs/LangRef.rst | 64 +++++++++++++++------------------------- llvm/lib/IR/Verifier.cpp | 1 + 2 files changed, 24 insertions(+), 41 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 7254e8faadeb5..203e40c68b177 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20736,8 +20736,27 @@ Note that it has the following implications: - If ``%cnt`` is non-zero, the return value is non-zero as well. - If ``%cnt`` is less than or equal to ``%max_lanes``, the return value is equal to ``%cnt``. +Vector Partial Reduction Intrinsics +----------------------------------- + +Partial horizontal reductions of vectors can be expressed using the following intrinsics. +Each one reduces the concatenation of the two vector arguments down to the number of elements +of the result vector type. + +Other than the reduction operator (e.g. add, fadd) the way in which the concatenated +arguments is reduced is entirely unspecified. By their nature these intrinsics +are not expected to be useful in isolation but instead implement the first phase +of an overall reduction operation. + +The typical use case is loop vectorization where reductions are split into an +in-loop phase, where maintaining an unordered vector result is important for +performance, and an out-of-loop phase to calculate the final scalar result. + +By avoiding the introduction of new ordering constraints, these intrinsics +enhance the ability to leverage a target's accumulation instructions. + '``llvm.vector.partial.reduce.add.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Syntax: """"""" @@ -20750,13 +20769,6 @@ This is an overloaded intrinsic. declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %a, %b) declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %a, %b) -Overview: -""""""""" - -The '``llvm.vector.partial.reduce.add.*``' intrinsics reduce the -concatenation of the two vector arguments down to the number of elements of the -result vector type. - Arguments: """""""""" @@ -20765,21 +20777,6 @@ The first argument is an integer vector with the same type as the result. 
The second argument is a vector with a length that is a known integer multiple of the result's type, while maintaining the same element type. -Semantics: -"""""""""" - -Other than the reduction operator (e.g. add) the way in which the concatenated -arguments is reduced is entirely unspecified. By their nature these intrinsics -are not expected to be useful in isolation but instead implement the first phase -of an overall reduction operation. - -The typical use case is loop vectorization where reductions are split into an -in-loop phase, where maintaining an unordered vector result is important for -performance, and an out-of-loop phase to calculate the final scalar result. - -By avoiding the introduction of new ordering constraints, these intrinsics -enhance the ability to leverage a target's accumulation instructions. - '``llvm.vector.partial.reduce.fadd.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -20792,13 +20789,6 @@ This is an overloaded intrinsic. declare <4 x f32> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x f32> %a, <8 x f32> %b) declare @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32( %a, %b) -Overview: -""""""""" - -The '``llvm.vector.partial.reduce.fadd.*``' intrinsics reduce the -concatenation of the two vector arguments down to the number of elements of the -result vector type. - Arguments: """""""""" @@ -20810,17 +20800,9 @@ of the result's type, while maintaining the same element type. Semantics: """""""""" -Other than the reduction operator (e.g. fadd) the way in which the concatenated -arguments is reduced is entirely unspecified. By their nature these intrinsics -are not expected to be useful in isolation but instead implement the first phase -of an overall reduction operation. - -The typical use case is loop vectorization where reductions are split into an -in-loop phase, where maintaining an unordered vector result is important for -performance, and an out-of-loop phase to calculate the final scalar result. - -By avoiding the introduction of new ordering constraints, these intrinsics -enhance the ability to leverage a target's accumulation instructions. +As the way in which the arguments to this floating-point intrinsic are reduced is unspecified, +this intrinsic will reassociate floating-point values, which may result in variations to the +results due to reordering or by lowering to different instructions. 
'``llvm.experimental.vector.histogram.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 03da1547b652f..7efba21238458 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6578,6 +6578,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } break; } + case Intrinsic::vector_partial_reduce_fadd: case Intrinsic::vector_partial_reduce_add: { VectorType *AccTy = cast(Call.getArgOperand(0)->getType()); VectorType *VecTy = cast(Call.getArgOperand(1)->getType()); From 63bd5215717f2748fd484d922b984b320abb28d4 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Fri, 10 Oct 2025 13:20:42 +0000 Subject: [PATCH 07/15] Address review comments --- llvm/docs/LangRef.rst | 138 +++++++++--------- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 24 ++- llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 12 ++ 3 files changed, 102 insertions(+), 72 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 203e40c68b177..9424594f5348f 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20363,6 +20363,76 @@ Arguments: """""""""" The argument to this intrinsic must be a vector of floating-point values. +Vector Partial Reduction Intrinsics +----------------------------------- + +Partial reductions of vectors can be expressed using the following intrinsics. +Each one reduces the concatenation of the two vector arguments down to the +number of elements of the result vector type. + +Other than the reduction operator (e.g. add, fadd) the way in which the +concatenated arguments is reduced is entirely unspecified. By their nature these +intrinsics are not expected to be useful in isolation but instead implement the +first phase of an overall reduction operation. + +The typical use case is loop vectorization where reductions are split into an +in-loop phase, where maintaining an unordered vector result is important for +performance, and an out-of-loop phase to calculate the final scalar result. + +By avoiding the introduction of new ordering constraints, these intrinsics +enhance the ability to leverage a target's accumulation instructions. + +'``llvm.vector.partial.reduce.add.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b) + declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b) + declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %a, %b) + declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %a, %b) + +Arguments: +"""""""""" + +The first argument is an integer vector with the same type as the result. + +The second argument is a vector with a length that is a known integer multiple +of the result's type, while maintaining the same element type. + +'``llvm.vector.partial.reduce.fadd.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <4 x f32> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x f32> %a, <8 x f32> %b) + declare @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32( %a, %b) + +Arguments: +"""""""""" + +The first argument is a floating-point vector with the same type as the result. + +The second argument is a vector with a length that is a known integer multiple +of the result's type, while maintaining the same element type. 
+ +Semantics: +"""""""""" + +As the way in which the arguments to this floating-point intrinsic are reduced +is unspecified, this intrinsic will assume floating-point reassociation and +contraction, which may result in variations to the results due to reordering or +by lowering to different instructions (including combining multiple instructions +into a single one). + '``llvm.vector.insert``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -20736,74 +20806,6 @@ Note that it has the following implications: - If ``%cnt`` is non-zero, the return value is non-zero as well. - If ``%cnt`` is less than or equal to ``%max_lanes``, the return value is equal to ``%cnt``. -Vector Partial Reduction Intrinsics ------------------------------------ - -Partial horizontal reductions of vectors can be expressed using the following intrinsics. -Each one reduces the concatenation of the two vector arguments down to the number of elements -of the result vector type. - -Other than the reduction operator (e.g. add, fadd) the way in which the concatenated -arguments is reduced is entirely unspecified. By their nature these intrinsics -are not expected to be useful in isolation but instead implement the first phase -of an overall reduction operation. - -The typical use case is loop vectorization where reductions are split into an -in-loop phase, where maintaining an unordered vector result is important for -performance, and an out-of-loop phase to calculate the final scalar result. - -By avoiding the introduction of new ordering constraints, these intrinsics -enhance the ability to leverage a target's accumulation instructions. - -'``llvm.vector.partial.reduce.add.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" -This is an overloaded intrinsic. - -:: - - declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b) - declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b) - declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32( %a, %b) - declare @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32( %a, %b) - -Arguments: -"""""""""" - -The first argument is an integer vector with the same type as the result. - -The second argument is a vector with a length that is a known integer multiple -of the result's type, while maintaining the same element type. - -'``llvm.vector.partial.reduce.fadd.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" -This is an overloaded intrinsic. - -:: - - declare <4 x f32> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x f32> %a, <8 x f32> %b) - declare @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32( %a, %b) - -Arguments: -"""""""""" - -The first argument is a floating-point vector with the same type as the result. - -The second argument is a vector with a length that is a known integer multiple -of the result's type, while maintaining the same element type. - -Semantics: -"""""""""" - -As the way in which the arguments to this floating-point intrinsic are reduced is unspecified, -this intrinsic will reassociate floating-point values, which may result in variations to the -results due to reordering or by lowering to different instructions. 
- '``llvm.experimental.vector.histogram.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index dda05ddb4e53e..c19249e31ee4d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12989,6 +12989,12 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) { // // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) +// +// partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0)) +// -> partial_reduce_fmla(acc, a, b) +// +// partial_reduce_fmla(acc, fmul(fpext(x), splat(C)), splat(1.0)) +// -> partial_reduce_fmla(acc, x, C) SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDLoc DL(N); auto *Context = DAG.getContext(); @@ -13025,8 +13031,12 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { CFP->isExactlyValue(1.0))) return SDValue(); + auto IsIntOrFPExtOpcode = [](unsigned int Opcode) { + return (ISD::isExtOpcode(Opcode) || Opcode == ISD::FP_EXTEND); + }; + unsigned LHSOpcode = LHS->getOpcode(); - if (!ISD::isExtOpcode(LHSOpcode) && LHSOpcode != ISD::FP_EXTEND) + if (!IsIntOrFPExtOpcode(LHSOpcode)) return SDValue(); SDValue LHSExtOp = LHS->getOperand(0); @@ -13058,7 +13068,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { } unsigned RHSOpcode = RHS->getOpcode(); - if (!ISD::isExtOpcode(RHSOpcode) && RHSOpcode != ISD::FP_EXTEND) + if (!IsIntOrFPExtOpcode(RHSOpcode)) return SDValue(); SDValue RHSExtOp = RHS->getOperand(0); @@ -13104,6 +13114,8 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // -> partial.reduce.smla(acc, op, splat(trunc(1))) // partial.reduce.sumla(acc, sext(op), splat(1)) // -> partial.reduce.smla(acc, op, splat(trunc(1))) +// partial.reduce.fmla(acc, fpext(op), splat(1.0)) +// -> partial.reduce.fmla(acc, op, splat(1.0)) SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDLoc DL(N); SDValue Acc = N->getOperand(0); @@ -13120,7 +13132,7 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); - if (!ISD::isExtOpcode(Op1Opcode)) + if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND) return SDValue(); bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND; @@ -13144,8 +13156,12 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { TLI.getTypeToTransformTo(*Context, UnextOp1VT))) return SDValue(); + auto Constant = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? 
DAG.getConstantFP(1, DL, UnextOp1VT) + : DAG.getConstant(1, DL, UnextOp1VT); + return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1, - DAG.getConstant(1, DL, UnextOp1VT)); + Constant); } SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) { diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll index 69c0b68f23f78..5e55a1869e95f 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -14,6 +14,18 @@ entry: ret %partial.reduce } +define @fdot_splat_vl128( %acc, %a) { +; CHECK-LABEL: fdot_splat_vl128: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fmov z2.h, #1.00000000 +; CHECK-NEXT: fdot z0.s, z1.h, z2.h +; CHECK-NEXT: ret +entry: + %a.wide = fpext %a to + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a.wide) + ret %partial.reduce +} + define void @fdot_wide_vl256(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,2) { ; CHECK-LABEL: fdot_wide_vl256: ; CHECK: // %bb.0: // %entry From 023337b2711744b017876214f6ca12462bba70e3 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Mon, 13 Oct 2025 13:24:38 +0000 Subject: [PATCH 08/15] Fixed-length SVE and fix for generating MUL instructions from a partial.reduce.fadd --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 9 ++++++++- .../lib/Target/AArch64/AArch64ISelLowering.cpp | 8 ++++++++ llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 18 ++++-------------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 3e5e41555c43b..a24741063e472 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12082,7 +12082,14 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, C->isExactlyValue(1.0)) && !(ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) && ConstantOne.isOne())) - Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + switch (N->getOpcode()) { + case ISD::PARTIAL_REDUCE_FMLA: + Input = DAG.getNode(ISD::FMUL, DL, ExtMulOpVT, MulLHS, MulRHS); + break; + default: + Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + break; + }; unsigned Stride = AccVT.getVectorMinNumElements(); unsigned ScaleFactor = MulOpVT.getVectorMinNumElements() / Stride; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e719863c3ad99..d7956c59408e5 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2294,6 +2294,13 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { MVT::getVectorVT(MVT::i8, NumElts * 8), Custom); } + if (Subtarget->hasSVE2p1()) { + if (VT.getVectorElementType() == MVT::f32) + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT, + MVT::getVectorVT(MVT::f16, NumElts * 2), + Custom); + } + // Lower fixed length vector operations to scalable equivalents. 
setOperationAction(ISD::ABDS, VT, Default); setOperationAction(ISD::ABDU, VT, Default); @@ -7917,6 +7924,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: return LowerPARTIAL_REDUCE_MLA(Op, DAG); } } diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll index 5e55a1869e95f..cb256851de9c7 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -29,20 +29,10 @@ entry: define void @fdot_wide_vl256(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,2) { ; CHECK-LABEL: fdot_wide_vl256: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: ld1h { z0.s }, p0/z, [x1] -; CHECK-NEXT: ld1h { z1.s }, p0/z, [x2] -; CHECK-NEXT: ld1h { z2.s }, p0/z, [x1, #1, mul vl] -; CHECK-NEXT: ld1h { z3.s }, p0/z, [x2, #1, mul vl] -; CHECK-NEXT: fcvt z0.s, p0/m, z0.h -; CHECK-NEXT: fcvt z1.s, p0/m, z1.h -; CHECK-NEXT: fcvt z2.s, p0/m, z2.h -; CHECK-NEXT: fcvt z3.s, p0/m, z3.h -; CHECK-NEXT: fmul z0.s, z0.s, z1.s -; CHECK-NEXT: ldr z1, [x0] -; CHECK-NEXT: fmul z2.s, z2.s, z3.s -; CHECK-NEXT: fadd z0.s, z1.s, z0.s -; CHECK-NEXT: fadd z0.s, z0.s, z2.s +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ldr z1, [x1] +; CHECK-NEXT: ldr z2, [x2] +; CHECK-NEXT: fdot z0.s, z1.h, z2.h ; CHECK-NEXT: str z0, [x0] ; CHECK-NEXT: ret entry: From b3c50765576b670010d21af9c1d25b2bdfff6243 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Mon, 13 Oct 2025 15:43:57 +0000 Subject: [PATCH 09/15] Address nits --- llvm/docs/LangRef.rst | 19 +++++++++--------- .../CodeGen/SelectionDAG/TargetLowering.cpp | 20 +++++++------------ .../Target/AArch64/AArch64ISelLowering.cpp | 8 +++----- 3 files changed, 19 insertions(+), 28 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 9424594f5348f..30e3b64e30eea 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20366,18 +20366,19 @@ The argument to this intrinsic must be a vector of floating-point values. Vector Partial Reduction Intrinsics ----------------------------------- -Partial reductions of vectors can be expressed using the following intrinsics. -Each one reduces the concatenation of the two vector arguments down to the -number of elements of the result vector type. +Partial reductions of vectors can be expressed using the intrinsics described in +this section. Each one reduces the concatenation of the two vector arguments +down to the number of elements of the result vector type. -Other than the reduction operator (e.g. add, fadd) the way in which the +Other than the reduction operator (e.g. add, fadd), the way in which the concatenated arguments is reduced is entirely unspecified. By their nature these -intrinsics are not expected to be useful in isolation but instead implement the -first phase of an overall reduction operation. +intrinsics are not expected to be useful in isolation but can instead be used to +implement the first phase of an overall reduction operation. The typical use case is loop vectorization where reductions are split into an in-loop phase, where maintaining an unordered vector result is important for -performance, and an out-of-loop phase to calculate the final scalar result. +performance, and an out-of-loop phase is required to calculate the final scalar +result. 
By avoiding the introduction of new ordering constraints, these intrinsics enhance the ability to leverage a target's accumulation instructions. @@ -20429,9 +20430,7 @@ Semantics: As the way in which the arguments to this floating-point intrinsic are reduced is unspecified, this intrinsic will assume floating-point reassociation and -contraction, which may result in variations to the results due to reordering or -by lowering to different instructions (including combining multiple instructions -into a single one). +contraction, which may result in variations to the results. '``llvm.vector.insert``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index a24741063e472..cff5e3229e067 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12076,20 +12076,14 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, } SDValue Input = MulLHS; APInt ConstantOne; - ConstantFPSDNode *C; - if (!(N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA && - (C = llvm::isConstOrConstSplatFP(MulRHS, false)) && - C->isExactlyValue(1.0)) && - !(ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) && - ConstantOne.isOne())) - switch (N->getOpcode()) { - case ISD::PARTIAL_REDUCE_FMLA: + if (N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA) { + ConstantFPSDNode *C = llvm::isConstOrConstSplatFP(MulRHS, false); + if (!(C && C->isExactlyValue(1.0))) Input = DAG.getNode(ISD::FMUL, DL, ExtMulOpVT, MulLHS, MulRHS); - break; - default: - Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); - break; - }; + } else if (!(ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) && + ConstantOne.isOne())) { + Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + } unsigned Stride = AccVT.getVectorMinNumElements(); unsigned ScaleFactor = MulOpVT.getVectorMinNumElements() / Stride; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d7956c59408e5..d47e48c529edf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2294,11 +2294,9 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { MVT::getVectorVT(MVT::i8, NumElts * 8), Custom); } - if (Subtarget->hasSVE2p1()) { - if (VT.getVectorElementType() == MVT::f32) - setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT, - MVT::getVectorVT(MVT::f16, NumElts * 2), - Custom); + if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT, + MVT::getVectorVT(MVT::f16, NumElts * 2), Custom); } // Lower fixed length vector operations to scalable equivalents. 
From 40b9c42864fb3e08f00a189bd2007982fab939e3 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Wed, 15 Oct 2025 15:40:13 +0000 Subject: [PATCH 10/15] Simplify conditionals and further refine documentation --- llvm/docs/LangRef.rst | 4 +- llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 4 + llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 ++-- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 5 ++ .../SelectionDAG/SelectionDAGBuilder.cpp | 6 +- .../CodeGen/SelectionDAG/TargetLowering.cpp | 29 ++++--- llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 84 +++++++++++++++---- 7 files changed, 100 insertions(+), 50 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 30e3b64e30eea..67a03478b85ef 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20430,7 +20430,9 @@ Semantics: As the way in which the arguments to this floating-point intrinsic are reduced is unspecified, this intrinsic will assume floating-point reassociation and -contraction, which may result in variations to the results. +contraction can be leveraged to implement the reduction, which may result in +variations to the results due to reordering or by lowering to different +instructions (including combining multiple instructions into a single one). '``llvm.vector.insert``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h index 69713d0d84011..ed5c7572ab592 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h @@ -1938,6 +1938,10 @@ LLVM_ABI bool isNullOrNullSplat(SDValue V, bool AllowUndefs = false); /// be zero. LLVM_ABI bool isOneOrOneSplat(SDValue V, bool AllowUndefs = false); +/// Return true if the value is a constant floating-point value, or a splatted +/// vector of a constant floating-point value, of 1.0 (with no undefs). +LLVM_ABI bool isOneOrOneSplatFP(SDValue V, bool AllowUndefs = false); + /// Return true if the value is a constant -1 integer or a splatted vector of a /// constant -1 integer (with no undefs). /// Does not permit build vector implicit truncation. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c19249e31ee4d..d4fc22aa77b0b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13022,13 +13022,8 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { Opc = ISD::MUL; } - APInt C; - ConstantFPSDNode *CFP; - if (!(Op1->getOpcode() == ISD::MUL && - ISD::isConstantSplatVector(Op2.getNode(), C) && C.isOne()) && - !(Op1->getOpcode() == ISD::FMUL && - (CFP = llvm::isConstOrConstSplatFP(Op2, false)) && - CFP->isExactlyValue(1.0))) + if (!(Opc == ISD::MUL && llvm::isOneOrOneSplat(Op2)) && + !(Opc == ISD::FMUL && llvm::isOneOrOneSplatFP(Op2))) return SDValue(); auto IsIntOrFPExtOpcode = [](unsigned int Opcode) { @@ -13044,6 +13039,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) + APInt C; if (ISD::isConstantSplatVector(RHS.getNode(), C)) { // TODO: Make use of partial_reduce_sumla here APInt CTrunc = C.trunc(LHSExtOpVT.getScalarSizeInBits()); @@ -13122,13 +13118,9 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDValue Op1 = N->getOperand(1); SDValue Op2 = N->getOperand(2); - APInt ConstantOne; - ConstantFPSDNode *C; if (!(N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA && - (C = llvm::isConstOrConstSplatFP(Op2, false)) && - C->isExactlyValue(1.0)) && - !(ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) && - ConstantOne.isOne())) + llvm::isOneOrOneSplatFP(Op2)) && + !llvm::isOneOrOneSplat(Op2)) return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 0769fa1072888..6f2a009676442 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -13055,6 +13055,11 @@ bool llvm::isOneOrOneSplat(SDValue N, bool AllowUndefs) { return C && C->isOne(); } +bool llvm::isOneOrOneSplatFP(SDValue N, bool AllowUndefs) { + ConstantFPSDNode *C = isConstOrConstSplatFP(N, AllowUndefs); + return C && C->isExactlyValue(1.0); +} + bool llvm::isAllOnesOrAllOnesSplat(SDValue N, bool AllowUndefs) { N = peekThroughBitcasts(N); unsigned BitWidth = N.getScalarValueSizeInBits(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index eb5fe4f5328c6..25b2fa9f51c73 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8082,16 +8082,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, return; } case Intrinsic::vector_partial_reduce_fadd: { - if (!TLI.shouldExpandPartialReductionIntrinsic(cast(&I))) { - visitTargetIntrinsic(I, Intrinsic); - return; - } SDValue Acc = getValue(I.getOperand(0)); SDValue Input = getValue(I.getOperand(1)); setValue(&I, DAG.getNode(ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc, Input, - DAG.getConstantFP(1.0f, sdl, Input.getValueType()))); + DAG.getConstantFP(1.0, sdl, Input.getValueType()))); return; } case Intrinsic::experimental_cttz_elts: { diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index cff5e3229e067..0d9a133000bd4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12061,27 +12061,30 @@ SDValue 
TargetLowering::expandPartialReduceMLA(SDNode *N, EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(), MulOpVT.getVectorElementCount()); - unsigned ExtOpcLHS = - N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FP_EXTEND - : N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA ? ISD::ZERO_EXTEND - : ISD::SIGN_EXTEND; - unsigned ExtOpcRHS = - N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FP_EXTEND - : N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA ? ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND; + unsigned ExtOpcLHS, ExtOpcRHS; + switch (N->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode"); + case ISD::PARTIAL_REDUCE_UMLA: + ExtOpcLHS = ExtOpcRHS = ISD::ZERO_EXTEND; + break; + case ISD::PARTIAL_REDUCE_SMLA: + ExtOpcLHS = ExtOpcRHS = ISD::SIGN_EXTEND; + break; + case ISD::PARTIAL_REDUCE_FMLA: + ExtOpcLHS = ExtOpcRHS = ISD::FP_EXTEND; + break; + } if (ExtMulOpVT != MulOpVT) { MulLHS = DAG.getNode(ExtOpcLHS, DL, ExtMulOpVT, MulLHS); MulRHS = DAG.getNode(ExtOpcRHS, DL, ExtMulOpVT, MulRHS); } SDValue Input = MulLHS; - APInt ConstantOne; if (N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA) { - ConstantFPSDNode *C = llvm::isConstOrConstSplatFP(MulRHS, false); - if (!(C && C->isExactlyValue(1.0))) + if (!llvm::isOneOrOneSplatFP(MulRHS)) Input = DAG.getNode(ISD::FMUL, DL, ExtMulOpVT, MulLHS, MulRHS); - } else if (!(ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) && - ConstantOne.isOne())) { + } else if (!llvm::isOneOrOneSplat(MulRHS)) { Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); } diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll index cb256851de9c7..258a372dfdbb2 100644 --- a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -1,11 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 define @fdot_wide_vl128( %acc, %a, %b) { -; CHECK-LABEL: fdot_wide_vl128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fdot z0.s, z1.h, z2.h -; CHECK-NEXT: ret +; SVE2-LABEL: fdot_wide_vl128: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: uunpklo z3.s, z1.h +; SVE2-NEXT: uunpklo z4.s, z2.h +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: uunpkhi z1.s, z1.h +; SVE2-NEXT: uunpkhi z2.s, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fcvt z4.s, p0/m, z4.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fmul z3.s, z3.s, z4.s +; SVE2-NEXT: fmul z1.s, z1.s, z2.s +; SVE2-NEXT: fadd z0.s, z0.s, z3.s +; SVE2-NEXT: fadd z0.s, z0.s, z1.s +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_vl128: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: ret entry: %a.wide = fpext %a to %b.wide = fpext %b to @@ -15,11 +33,22 @@ entry: } define @fdot_splat_vl128( %acc, %a) { -; CHECK-LABEL: fdot_splat_vl128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov z2.h, #1.00000000 -; CHECK-NEXT: fdot z0.s, z1.h, z2.h -; CHECK-NEXT: ret +; SVE2-LABEL: fdot_splat_vl128: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: uunpklo z2.s, z1.h +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: uunpkhi z1.s, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fadd z0.s, z0.s, z2.s +; SVE2-NEXT: fadd z0.s, z0.s, z1.s +; 
SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_splat_vl128: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: fmov z2.h, #1.00000000 +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: ret entry: %a.wide = fpext %a to %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a.wide) @@ -27,14 +56,33 @@ entry: } define void @fdot_wide_vl256(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,2) { -; CHECK-LABEL: fdot_wide_vl256: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ldr z0, [x0] -; CHECK-NEXT: ldr z1, [x1] -; CHECK-NEXT: ldr z2, [x2] -; CHECK-NEXT: fdot z0.s, z1.h, z2.h -; CHECK-NEXT: str z0, [x0] -; CHECK-NEXT: ret +; SVE2-LABEL: fdot_wide_vl256: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, #1, mul vl] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, #1, mul vl] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, z0.s, z1.s +; SVE2-NEXT: ldr z1, [x0] +; SVE2-NEXT: fmul z2.s, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, z1.s, z0.s +; SVE2-NEXT: fadd z0.s, z0.s, z2.s +; SVE2-NEXT: str z0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_vl256: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ldr z0, [x0] +; SVE2P1-NEXT: ldr z1, [x1] +; SVE2P1-NEXT: ldr z2, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: str z0, [x0] +; SVE2P1-NEXT: ret entry: %acc = load <8 x float>, ptr %accptr %a = load <16 x half>, ptr %aptr From daa155efd28397906ff19a62ddc7c91b929f9061 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Wed, 15 Oct 2025 15:58:57 +0000 Subject: [PATCH 11/15] auto -> SDValue --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d4fc22aa77b0b..843ab9a38f3e0 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13148,9 +13148,9 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { TLI.getTypeToTransformTo(*Context, UnextOp1VT))) return SDValue(); - auto Constant = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA - ? DAG.getConstantFP(1, DL, UnextOp1VT) - : DAG.getConstant(1, DL, UnextOp1VT); + SDValue Constant = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? 
DAG.getConstantFP(1, DL, UnextOp1VT) + : DAG.getConstant(1, DL, UnextOp1VT); return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1, Constant); From 461f48eab57b50809ef5d7cd8b1179ae7c8c4efb Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Fri, 17 Oct 2025 14:03:29 +0000 Subject: [PATCH 12/15] Use PatFrag rather than Pat, move fixed-length SVE tests, and test GlobalISel --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 12 +- .../Target/AArch64/AArch64ISelLowering.cpp | 10 +- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 7 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 1 - llvm/test/CodeGen/AArch64/sve2p1-fdot.ll | 119 ++------- .../AArch64/sve2p1-fixed-length-fdot.ll | 230 ++++++++++++++++++ 6 files changed, 258 insertions(+), 121 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 843ab9a38f3e0..f62cc4b7c9110 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12992,9 +12992,6 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) { // // partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0)) // -> partial_reduce_fmla(acc, a, b) -// -// partial_reduce_fmla(acc, fmul(fpext(x), splat(C)), splat(1.0)) -// -> partial_reduce_fmla(acc, x, C) SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDLoc DL(N); auto *Context = DAG.getContext(); @@ -13118,20 +13115,17 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDValue Op1 = N->getOperand(1); SDValue Op2 = N->getOperand(2); - if (!(N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA && - llvm::isOneOrOneSplatFP(Op2)) && - !llvm::isOneOrOneSplat(Op2)) + if (!llvm::isOneOrOneSplat(Op2) && !llvm::isOneOrOneSplatFP(Op2)) return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND) return SDValue(); - bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND; + bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND || Op1Opcode == ISD::FP_EXTEND; bool NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA; EVT AccElemVT = Acc.getValueType().getVectorElementType(); - if (N->getOpcode() != ISD::PARTIAL_REDUCE_FMLA && - Op1IsSigned != NodeIsSigned && + if (Op1IsSigned != NodeIsSigned && Op1.getValueType().getVectorElementType() != AccElemVT) return SDValue(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d47e48c529edf..abe1af23b4e40 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1921,12 +1921,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal); setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal); } - } - // Handle floating-point partial reduction - if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { - static const unsigned FMLAOps[] = {ISD::PARTIAL_REDUCE_FMLA}; - setPartialReduceMLAAction(FMLAOps, MVT::nxv4f32, MVT::nxv8f16, Legal); + // Handle floating-point partial reduction + if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32, + MVT::nxv8f16, Legal); + } } // Handle non-aliasing elements mask diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 98a128e582866..44628eacc83c0 100644 
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -375,6 +375,11 @@ def AArch64fclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), node:$Zm) ]>; +def AArch64fdot : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), + [(int_aarch64_sve_fdot_x2 node:$Zd, node:$Zn, node:$Zm), + (partial_reduce_fmla node:$Zd, node:$Zn, node:$Zm) + ]>; + def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>, SDTCisSameAs<0,3> @@ -4251,7 +4256,7 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>; let Predicates = [HasSVE2p1_or_SME2] in { defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>; -defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; +defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, AArch64fdot>; defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index e60a8632d2e8b..3cdd505f12116 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -9457,7 +9457,6 @@ multiclass sve_float_dot { def NAME : sve_float_dot; def : SVE_3_Op_Pat(NAME)>; - def : SVE_3_Op_Pat(NAME)>; } multiclass sve_fp8_dot @fdot_wide_vl128( %acc, %a, %b) { -; SVE2-LABEL: fdot_wide_vl128: +define @fdot_wide_nxv4f32( %acc, %a, %b) { +; SVE2-LABEL: fdot_wide_nxv4f32: ; SVE2: // %bb.0: // %entry ; SVE2-NEXT: uunpklo z3.s, z1.h ; SVE2-NEXT: uunpklo z4.s, z2.h @@ -20,7 +22,7 @@ define @fdot_wide_vl128( %acc, %partial.reduce } -define @fdot_splat_vl128( %acc, %a) { -; SVE2-LABEL: fdot_splat_vl128: +define @fdot_splat_nxv4f32( %acc, %a) { +; SVE2-LABEL: fdot_splat_nxv4f32: ; SVE2: // %bb.0: // %entry ; SVE2-NEXT: uunpklo z2.s, z1.h ; SVE2-NEXT: ptrue p0.s @@ -44,7 +46,7 @@ define @fdot_splat_vl128( %acc, %partial.reduce } -define void @fdot_wide_vl256(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,2) { -; SVE2-LABEL: fdot_wide_vl256: -; SVE2: // %bb.0: // %entry -; SVE2-NEXT: ptrue p0.s -; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] -; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] -; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, #1, mul vl] -; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, #1, mul vl] -; SVE2-NEXT: fcvt z0.s, p0/m, z0.h -; SVE2-NEXT: fcvt z1.s, p0/m, z1.h -; SVE2-NEXT: fcvt z2.s, p0/m, z2.h -; SVE2-NEXT: fcvt z3.s, p0/m, z3.h -; SVE2-NEXT: fmul z0.s, z0.s, z1.s -; SVE2-NEXT: ldr z1, [x0] -; SVE2-NEXT: fmul z2.s, z2.s, z3.s -; SVE2-NEXT: fadd z0.s, z1.s, z0.s -; SVE2-NEXT: fadd z0.s, z0.s, z2.s -; SVE2-NEXT: str z0, [x0] -; SVE2-NEXT: ret -; -; SVE2P1-LABEL: fdot_wide_vl256: -; SVE2P1: // %bb.0: // %entry -; SVE2P1-NEXT: ldr z0, [x0] -; SVE2P1-NEXT: ldr z1, [x1] -; SVE2P1-NEXT: ldr z2, [x2] -; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h -; SVE2P1-NEXT: str z0, [x0] -; SVE2P1-NEXT: ret -entry: - %acc = load <8 x float>, ptr %accptr - %a = load <16 x half>, ptr %aptr - %b = load <16 x half>, ptr %bptr - %a.wide = fpext <16 x half> %a to <16 x float> - %b.wide = fpext <16 x half> %b to <16 x float> - %mult = fmul <16 x float> %a.wide, %b.wide - %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) - store <8 x float> %partial.reduce, ptr %accptr - ret void -} - -define <4 x 
float> @fixed_fdot_wide(<4 x float> %acc, <8 x half> %a, <8 x half> %b) { -; CHECK-LABEL: fixed_fdot_wide: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fcvtl v3.4s, v1.4h -; CHECK-NEXT: fcvtl v4.4s, v2.4h -; CHECK-NEXT: fcvtl2 v1.4s, v1.8h -; CHECK-NEXT: fcvtl2 v2.4s, v2.8h -; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s -; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s -; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s -; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: ret -entry: - %a.wide = fpext <8 x half> %a to <8 x float> - %b.wide = fpext <8 x half> %b to <8 x float> - %mult = fmul <8 x float> %a.wide, %b.wide - %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) - ret <4 x float> %partial.reduce -} - -define <8 x half> @partial_reduce_half(<8 x half> %acc, <16 x half> %a) { -; CHECK-LABEL: partial_reduce_half: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h -; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h -; CHECK-NEXT: ret -entry: - %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) - ret <8 x half> %partial.reduce -} - -define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) { -; CHECK-LABEL: partial_reduce_float: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s -; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s -; CHECK-NEXT: ret -entry: - %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) - ret <4 x float> %partial.reduce -} - -define <2 x double> @partial_reduce_double(<2 x double> %acc, <4 x double> %a) { -; CHECK-LABEL: partial_reduce_double: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d -; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d -; CHECK-NEXT: ret -entry: - %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) - ret <2 x double> %partial.reduce -} - -define @partial_reduce_half_vl128( %acc, %a) { -; CHECK-LABEL: partial_reduce_half_vl128: +define @partial_reduce_nxv8f16( %acc, %a) { +; CHECK-LABEL: partial_reduce_nxv8f16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fadd z0.h, z0.h, z1.h ; CHECK-NEXT: fadd z0.h, z0.h, z2.h @@ -159,8 +68,8 @@ entry: ret %partial.reduce } -define @partial_reduce_float_vl128( %acc, %a) { -; CHECK-LABEL: partial_reduce_float_vl128: +define @partial_reduce_nxv4f32( %acc, %a) { +; CHECK-LABEL: partial_reduce_nxv4f32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fadd z0.s, z0.s, z1.s ; CHECK-NEXT: fadd z0.s, z0.s, z2.s @@ -170,8 +79,8 @@ entry: ret %partial.reduce } -define @partial_reduce_double_vl128( %acc, %a) { -; CHECK-LABEL: partial_reduce_double_vl128: +define @partial_reduce_nxv2f64( %acc, %a) { +; CHECK-LABEL: partial_reduce_nxv2f64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fadd z0.d, z0.d, z1.d ; CHECK-NEXT: fadd z0.d, z0.d, z2.d diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll new file mode 100644 index 0000000000000..b07b571413881 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll @@ -0,0 +1,230 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s 
--check-prefixes=CHECK,SVE2P1 +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 + +define void @fdot_wide_v8f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,0) { +; SVE2-LABEL: fdot_wide_v8f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl8 +; SVE2-NEXT: mov x8, #8 // =0x8 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v8f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl8 +; SVE2P1-NEXT: ptrue p1.h, vl16 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <8 x float>, ptr %accptr + %a = load <16 x half>, ptr %aptr + %b = load <16 x half>, ptr %bptr + %a.wide = fpext <16 x half> %a to <16 x float> + %b.wide = fpext <16 x half> %b to <16 x float> + %mult = fmul <16 x float> %a.wide, %b.wide + %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) + store <8 x float> %partial.reduce, ptr %accptr + ret void +} + +define void @fdot_wide_v16f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(4,0) { +; SVE2-LABEL: fdot_wide_v16f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl16 +; SVE2-NEXT: mov x8, #16 // =0x10 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v16f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl16 +; SVE2P1-NEXT: ptrue p1.h, vl32 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <16 x float>, ptr %accptr + %a = load <32 x half>, ptr %aptr + %b = load <32 x half>, ptr %bptr + %a.wide = fpext <32 x half> %a to <32 x float> + %b.wide = fpext <32 x half> %b to <32 x float> + %mult = fmul <32 x float> %a.wide, %b.wide + %partial.reduce = call <16 x float> @llvm.vector.partial.reduce.fadd(<16 x float> %acc, <32 x float> %mult) + store <16 x float> %partial.reduce, ptr %accptr + ret void +} + +define void @fdot_wide_v32f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(8,0) { +; SVE2-LABEL: fdot_wide_v32f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl32 +; 
SVE2-NEXT: mov x8, #32 // =0x20 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v32f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl32 +; SVE2P1-NEXT: ptrue p1.h, vl64 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <32 x float>, ptr %accptr + %a = load <64 x half>, ptr %aptr + %b = load <64 x half>, ptr %bptr + %a.wide = fpext <64 x half> %a to <64 x float> + %b.wide = fpext <64 x half> %b to <64 x float> + %mult = fmul <64 x float> %a.wide, %b.wide + %partial.reduce = call <32 x float> @llvm.vector.partial.reduce.fadd(<32 x float> %acc, <64 x float> %mult) + store <32 x float> %partial.reduce, ptr %accptr + ret void +} + +define void @fdot_wide_v64f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(16,0) { +; SVE2-LABEL: fdot_wide_v64f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl64 +; SVE2-NEXT: mov x8, #64 // =0x40 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v64f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl64 +; SVE2P1-NEXT: ptrue p1.h, vl128 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <64 x float>, ptr %accptr + %a = load <128 x half>, ptr %aptr + %b = load <128 x half>, ptr %bptr + %a.wide = fpext <128 x half> %a to <128 x float> + %b.wide = fpext <128 x half> %b to <128 x float> + %mult = fmul <128 x float> %a.wide, %b.wide + %partial.reduce = call <64 x float> @llvm.vector.partial.reduce.fadd(<64 x float> %acc, <128 x float> %mult) + store <64 x float> %partial.reduce, ptr %accptr + ret void +} + +define <4 x float> @fixed_fdot_wide(<4 x float> %acc, <8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: fixed_fdot_wide: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtl v3.4s, v1.4h +; CHECK-NEXT: fcvtl v4.4s, v2.4h +; CHECK-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-NEXT: fcvtl2 v2.4s, v2.8h +; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %a.wide = fpext <8 x half> %a to <8 x float> 
+ %b.wide = fpext <8 x half> %b to <8 x float> + %mult = fmul <8 x float> %a.wide, %b.wide + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) + ret <4 x float> %partial.reduce +} + +define <8 x half> @partial_reduce_half(<8 x half> %acc, <16 x half> %a) { +; CHECK-LABEL: partial_reduce_half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ret +entry: + %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) + ret <8 x half> %partial.reduce +} + +define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) { +; CHECK-LABEL: partial_reduce_float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ret +entry: + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) + ret <4 x float> %partial.reduce +} + +define <2 x double> @partial_reduce_double(<2 x double> %acc, <4 x double> %a) { +; CHECK-LABEL: partial_reduce_double: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d +; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d +; CHECK-NEXT: ret +entry: + %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) + ret <2 x double> %partial.reduce +} From 98e35dfab9a5d2f781997b9a6de6c20e6be9ab6f Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Fri, 17 Oct 2025 14:37:13 +0000 Subject: [PATCH 13/15] Fix formatting --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 ++- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 7 +++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f62cc4b7c9110..bf17e67276df2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -13122,7 +13122,8 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND) return SDValue(); - bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND || Op1Opcode == ISD::FP_EXTEND; + bool Op1IsSigned = + Op1Opcode == ISD::SIGN_EXTEND || Op1Opcode == ISD::FP_EXTEND; bool NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA; EVT AccElemVT = Acc.getValueType().getVectorElementType(); if (Op1IsSigned != NodeIsSigned && diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 25b2fa9f51c73..3b7dadbf32124 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8084,10 +8084,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::vector_partial_reduce_fadd: { SDValue Acc = getValue(I.getOperand(0)); SDValue Input = getValue(I.getOperand(1)); - setValue(&I, - DAG.getNode(ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc, - Input, - DAG.getConstantFP(1.0, sdl, Input.getValueType()))); + setValue(&I, DAG.getNode( + ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc, + Input, DAG.getConstantFP(1.0, sdl, Input.getValueType()))); return; } case Intrinsic::experimental_cttz_elts: { From 8236481d6b5af26f2f8c720c67fbe523d83e3541 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Wed, 8 Oct 2025 13:48:01 +0000 Subject: [PATCH 14/15] Add LoopVectoriser support for 
`llvm.vector.partial.reduce.fadd` --- .../llvm/Analysis/TargetTransformInfo.h | 7 +- llvm/lib/Analysis/TargetTransformInfo.cpp | 2 + .../AArch64/AArch64TargetTransformInfo.cpp | 6 +- .../Transforms/Vectorize/LoopVectorize.cpp | 13 +- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 24 ++- .../AArch64/partial-reduce-fdot-product.ll | 159 ++++++++++++++++++ 6 files changed, 199 insertions(+), 12 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 5d3b233ed6b6a..08b86f02c48f7 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -222,7 +222,12 @@ class TargetTransformInfoImplBase; /// for IR-level transformations. class TargetTransformInfo { public: - enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend }; + enum PartialReductionExtendKind { + PR_None, + PR_SignExtend, + PR_ZeroExtend, + PR_FPExtend + }; /// Get the kind of extension that an instruction represents. LLVM_ABI static PartialReductionExtendKind diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index bf62623099a97..b3e677d7d8731 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1005,6 +1005,8 @@ TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) { return PR_SignExtend; if (isa(I)) return PR_ZeroExtend; + if (isa(I)) + return PR_FPExtend; return PR_None; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e3370d31a0e39..497d4fb4d3392 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5640,7 +5640,8 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( (!ST->isNeonAvailable() || !ST->hasDotProd())) return Invalid; - if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) || + if ((Opcode != Instruction::Add && Opcode != Instruction::Sub && + Opcode != Instruction::FAdd) || OpAExtend == TTI::PR_None) return Invalid; @@ -5650,7 +5651,8 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( // We only support multiply binary operations for now, and for muls we // require the types being extended to be the same. - if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB)) + if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) || + InputTypeA != InputTypeB)) return Invalid; bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index adf27bed3d749..64c35bf1d6408 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5313,14 +5313,16 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, // it is not we return an invalid cost specifying the orignal cost method // should be used. 
Instruction *RetI = I; - if (match(RetI, m_ZExtOrSExt(m_Value()))) { + if (match(RetI, m_ZExtOrSExt(m_Value())) || match(RetI, m_FPExt(m_Value()))) { if (!RetI->hasOneUser()) return std::nullopt; RetI = RetI->user_back(); } - if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && - RetI->user_back()->getOpcode() == Instruction::Add) { + if ((match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && + RetI->user_back()->getOpcode() == Instruction::Add) || + (match(RetI, m_OneUse(m_FMul(m_Value(), m_Value()))) && + RetI->user_back()->getOpcode() == Instruction::FAdd)) { RetI = RetI->user_back(); } @@ -7967,7 +7969,8 @@ bool VPRecipeBuilder::getScaledReductions( continue; } Value *ExtOp; - if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp)))) + if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))) && + !match(OpI, m_FPExt(m_Value(ExtOp)))) return false; Exts[I] = cast(OpI); @@ -8138,6 +8141,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, return nullptr; unsigned ReductionOpcode = Reduction->getOpcode(); + if (ReductionOpcode == Instruction::FAdd && !Reduction->hasAllowReassoc()) + return nullptr; if (ReductionOpcode == Instruction::Sub) { auto *const Zero = ConstantInt::get(Reduction->getType(), 0); SmallVector Ops; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1f1b42bb9c19f..cd08f95254de5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -322,6 +322,8 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, return TTI::PR_ZeroExtend; if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) return TTI::PR_SignExtend; + if (WidenCastR->getOpcode() == Instruction::CastOps::FPExt) + return TTI::PR_FPExtend; return TTI::PR_None; }; @@ -374,8 +376,9 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, void VPPartialReductionRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; - assert(getOpcode() == Instruction::Add && - "Unhandled partial reduction opcode"); + assert( + (getOpcode() == Instruction::Add || getOpcode() == Instruction::FAdd) && + "Unhandled partial reduction opcode"); Value *BinOpVal = State.get(getOperand(1)); Value *PhiVal = State.get(getOperand(0)); @@ -383,9 +386,20 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) { Type *RetTy = PhiVal->getType(); - CallInst *V = - Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add, - {PhiVal, BinOpVal}, nullptr, "partial.reduce"); + enum llvm::Intrinsic::IndependentIntrinsics PRIntrinsic; + switch (getOpcode()) { + case Instruction::Add: { + PRIntrinsic = Intrinsic::vector_partial_reduce_add; + break; + } + case Instruction::FAdd: { + PRIntrinsic = Intrinsic::vector_partial_reduce_fadd; + break; + } + } + + CallInst *V = Builder.CreateIntrinsic(RetTy, PRIntrinsic, {PhiVal, BinOpVal}, + nullptr, "partial.reduce"); State.set(this, V); } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll new file mode 100644 index 0000000000000..8de64e5ec7504 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -mattr=+sve2p1,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 +; 
RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+sve2p1,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -mattr=+sve2p1,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define float @fdotp(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define float @fdotp( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = fmul <8 x float> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: ret float [[TMP7]] +; +; CHECK-INTERLEAVED-LABEL: define float @fdotp( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[TMP0]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], 
align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr half, ptr [[TMP4]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x half>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x half>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = fpext <8 x half> [[WIDE_LOAD3]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = fpext <8 x half> [[WIDE_LOAD4]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = fmul <8 x float> [[TMP7]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP8]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI1]], <8 x float> [[TMP9]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: ret float [[TMP11]] +; +; CHECK-MAXBW-LABEL: define float @fdotp( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP4]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP6]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (float -0.000000e+00), float 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP0]], align 1 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call 
@llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32( [[VEC_PHI]], [[TMP8]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret float [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi float [ 0.0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 1 + %ext.a = fpext half %load.a to float + %gep.b = getelementptr half, ptr %b, i64 %iv + %load.b = load half, ptr %gep.b, align 1 + %ext.b = fpext half %load.b to float + %mul = fmul float %ext.b, %ext.a + %add = fadd reassoc float %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret float %add +} +;. +; CHECK-INTERLEAVE1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVE1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVE1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. +; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. 
+; CHECK-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;. From 70050494fead23017dbe51a8fdd2330a861da254 Mon Sep 17 00:00:00 2001 From: Damian Heaton Date: Tue, 21 Oct 2025 15:30:57 +0000 Subject: [PATCH 15/15] Remove extraneous changes & add 'not_' tests --- .../Transforms/Vectorize/LoopVectorize.cpp | 8 +- .../AArch64/partial-reduce-fdot-product.ll | 942 ++++++++++++++++++ 2 files changed, 945 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 64c35bf1d6408..129083eb0eb27 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5313,16 +5313,14 @@ LoopVectorizationCostModel::getReductionPatternCost(Instruction *I, // it is not we return an invalid cost specifying the orignal cost method // should be used. Instruction *RetI = I; - if (match(RetI, m_ZExtOrSExt(m_Value())) || match(RetI, m_FPExt(m_Value()))) { + if (match(RetI, m_ZExtOrSExt(m_Value()))) { if (!RetI->hasOneUser()) return std::nullopt; RetI = RetI->user_back(); } - if ((match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && - RetI->user_back()->getOpcode() == Instruction::Add) || - (match(RetI, m_OneUse(m_FMul(m_Value(), m_Value()))) && - RetI->user_back()->getOpcode() == Instruction::FAdd)) { + if (match(RetI, m_OneUse(m_Mul(m_Value(), m_Value()))) && + RetI->user_back()->getOpcode() == Instruction::Add) { RetI = RetI->user_back(); } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll index 8de64e5ec7504..f748ec2744ee8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll @@ -143,17 +143,959 @@ for.body: ; preds = %for.body, %entry for.exit: ; preds = %for.body ret float %add } + +define double @not_fdotp_different_types(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define double @not_fdotp_different_types( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x double> [ , %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x double> +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr float, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = fpext <8 x float> [[WIDE_LOAD1]] to <8 x double> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = fmul <8 x double> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = fadd reassoc <8 x double> [[TMP4]], [[VEC_PHI]] +; 
CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: ret double [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define double @not_fdotp_different_types( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x double> [ , %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[GEP_A]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr float, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[GEP_B]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP5]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = fpext <8 x float> [[WIDE_LOAD3]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = fpext <8 x float> [[WIDE_LOAD4]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fmul <8 x double> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = fmul <8 x double> [[TMP7]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP10]] = fadd reassoc <8 x double> [[TMP8]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP11]] = fadd reassoc <8 x double> [[TMP9]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <8 x double> [[TMP11]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: ret double [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define double @not_fdotp_different_types( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { 
+; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (double -0.000000e+00), double 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 2 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP9]] = fadd reassoc [[TMP8]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call reassoc double @llvm.vector.reduce.fadd.nxv8f64(double -0.000000e+00, [[TMP9]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to double +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr float, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load float, ptr [[GEP_B]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext float [[LOAD_B]] to double +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul double [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc double [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] 
= phi double [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret double [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi double [ 0.0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 1 + %ext.a = fpext half %load.a to double + %gep.b = getelementptr float, ptr %b, i64 %iv + %load.b = load float, ptr %gep.b, align 2 + %ext.b = fpext float %load.b to double + %mul = fmul double %ext.b, %ext.a + %add = fadd reassoc double %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret double %add +} + +define float @not_fdotp_not_phi(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define float @not_fdotp_not_phi( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = fadd reassoc [[TMP8]], [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP9]], i32 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVE1: [[SCALAR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 
0.000000e+00, %[[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[FOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define float @not_fdotp_not_phi( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr half, ptr [[TMP4]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[TMP9]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = fmul [[TMP13]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = fadd reassoc [[TMP14]], [[TMP13]] +; 
CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP15]], i32 [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVED: [[SCALAR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVED: [[FOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define float @not_fdotp_not_phi( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: 
[[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 1 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = fadd reassoc [[TMP8]], [[TMP7]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 8 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1 +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP9]], i32 [[TMP13]] +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[EXT_B]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret float [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi float [ 0.0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 1 + %ext.a = fpext half %load.a to float + %gep.b = getelementptr half, ptr %b, i64 %iv + %load.b = load half, ptr %gep.b, align 1 + %ext.b = fpext half %load.b to float + %mul = fmul float %ext.b, %ext.a + %add = fadd reassoc float %mul, %ext.b + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret float %add +} + +define void @not_fdotp_not_phi2(ptr %matrix, i32 %n) #0 { +; 
CHECK-INTERLEAVE1-LABEL: define void @not_fdotp_not_phi2( +; CHECK-INTERLEAVE1-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label %[[FOR_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-INTERLEAVE1: [[FOR_PREHEADER]]: +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load half, ptr null, align 1 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A1:%.*]] = load half, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-INTERLEAVE1-NEXT: [[A_EXT:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVE1-NEXT: [[A_EXT1:%.*]] = fpext half [[LOAD_A1]] to float +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A_EXT]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[A_EXT1]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP37:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 64 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 2 +; 
CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = load half, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = load half, ptr [[TMP9]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = load half, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = load half, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = insertelement <4 x half> poison, half [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = insertelement <4 x half> [[TMP20]], half [[TMP17]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = insertelement <4 x half> [[TMP21]], half [[TMP18]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = insertelement <4 x half> [[TMP22]], half [[TMP19]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = fpext <4 x half> [[TMP23]] to <4 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP24]] +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = fadd reassoc <4 x float> [[TMP25]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = load half, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = load half, ptr [[TMP13]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = load half, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = load half, ptr [[TMP15]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = insertelement <4 x half> poison, half [[TMP27]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = insertelement <4 x half> [[TMP31]], half [[TMP28]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = insertelement <4 x half> [[TMP32]], half [[TMP29]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = insertelement <4 x half> [[TMP33]], half [[TMP30]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = fpext <4 x half> [[TMP34]] to <4 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP35]] +; CHECK-INTERLEAVE1-NEXT: [[TMP37]] = fadd reassoc <4 x float> [[TMP36]], [[TMP26]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP37]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVE1: [[SCALAR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[MATRIX]], %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP39]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[FOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi float [ [[ADD_1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: 
[[GEP_B:%.*]] = getelementptr half, ptr [[PTR]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B1:%.*]] = getelementptr half, ptr [[PTR]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[B_EXT:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = fmul float [[A_EXT]], [[B_EXT]] +; CHECK-INTERLEAVE1-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B1:%.*]] = load half, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[B_EXT1:%.*]] = fpext half [[LOAD_B1]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL_1:%.*]] = fmul float [[A_EXT1]], [[B_EXT1]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_1]] = fadd reassoc float [[MUL_1]], [[ADD]] +; CHECK-INTERLEAVE1-NEXT: [[SCEVGEP]] = getelementptr half, ptr [[PTR]], i64 16 +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_FLOAT:%.*]] = phi float [ [[ADD_1]], %[[FOR_BODY]] ], [ [[TMP39]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVE1: [[EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_FLOAT]], %[[FOR_EXIT]] ] +; CHECK-INTERLEAVE1-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-INTERLEAVE1-NEXT: ret void +; +; CHECK-INTERLEAVED-LABEL: define void @not_fdotp_not_phi2( +; CHECK-INTERLEAVED-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label %[[FOR_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_PREHEADER]]: +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load half, ptr null, align 1 +; CHECK-INTERLEAVED-NEXT: [[LOAD_A1:%.*]] = load half, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-INTERLEAVED-NEXT: [[A_EXT:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVED-NEXT: [[A_EXT1:%.*]] = fpext half [[LOAD_A1]] to float +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A_EXT]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[A_EXT1]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; 
CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP70:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP71:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 64 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 128 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 160 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 192 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 224 +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr half, ptr [[NEXT_GEP6]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr half, ptr [[NEXT_GEP7]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr half, ptr [[NEXT_GEP8]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr half, ptr [[NEXT_GEP9]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr half, ptr [[NEXT_GEP10]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr half, ptr [[NEXT_GEP6]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr half, ptr [[NEXT_GEP7]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = getelementptr half, ptr [[NEXT_GEP8]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr half, ptr [[NEXT_GEP9]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr half, ptr [[NEXT_GEP10]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = load half, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = load half, ptr [[TMP13]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = load half, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = load half, ptr [[TMP15]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = insertelement <4 x half> poison, 
half [[TMP28]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = insertelement <4 x half> [[TMP32]], half [[TMP29]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = insertelement <4 x half> [[TMP33]], half [[TMP30]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = insertelement <4 x half> [[TMP34]], half [[TMP31]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = load half, ptr [[TMP16]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = load half, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = load half, ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = load half, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = insertelement <4 x half> poison, half [[TMP36]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = insertelement <4 x half> [[TMP40]], half [[TMP37]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = insertelement <4 x half> [[TMP41]], half [[TMP38]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = insertelement <4 x half> [[TMP42]], half [[TMP39]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = fpext <4 x half> [[TMP35]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = fpext <4 x half> [[TMP43]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP45]] +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = fadd reassoc <4 x float> [[TMP46]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = fadd reassoc <4 x float> [[TMP47]], [[VEC_PHI3]] +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = load half, ptr [[TMP20]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = load half, ptr [[TMP21]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = load half, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = load half, ptr [[TMP23]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = insertelement <4 x half> poison, half [[TMP50]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = insertelement <4 x half> [[TMP54]], half [[TMP51]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = insertelement <4 x half> [[TMP55]], half [[TMP52]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = insertelement <4 x half> [[TMP56]], half [[TMP53]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = load half, ptr [[TMP24]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = load half, ptr [[TMP25]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = load half, ptr [[TMP26]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = load half, ptr [[TMP27]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = insertelement <4 x half> poison, half [[TMP58]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = insertelement <4 x half> [[TMP62]], half [[TMP59]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = insertelement <4 x half> [[TMP63]], half [[TMP60]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = insertelement <4 x half> [[TMP64]], half [[TMP61]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = fpext <4 x half> [[TMP57]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = fpext <4 x half> [[TMP65]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP66]] +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP67]] +; CHECK-INTERLEAVED-NEXT: [[TMP70]] = fadd reassoc <4 x float> [[TMP68]], [[TMP48]] +; CHECK-INTERLEAVED-NEXT: [[TMP71]] = fadd reassoc <4 x float> [[TMP69]], [[TMP49]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = 
add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP71]], [[TMP70]] +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVED: [[SCALAR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL11:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[MATRIX]], %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP73]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVED: [[FOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL11]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi float [ [[ADD_1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[PTR]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[GEP_B1:%.*]] = getelementptr half, ptr [[PTR]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[B_EXT:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = fmul float [[A_EXT]], [[B_EXT]] +; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B1:%.*]] = load half, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[B_EXT1:%.*]] = fpext half [[LOAD_B1]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL_1:%.*]] = fmul float [[A_EXT1]], [[B_EXT1]] +; CHECK-INTERLEAVED-NEXT: [[ADD_1]] = fadd reassoc float [[MUL_1]], [[ADD]] +; CHECK-INTERLEAVED-NEXT: [[SCEVGEP]] = getelementptr half, ptr [[PTR]], i64 16 +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[ADD_FLOAT:%.*]] = phi float [ [[ADD_1]], %[[FOR_BODY]] ], [ [[TMP73]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVED: [[EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_FLOAT]], %[[FOR_EXIT]] ] +; CHECK-INTERLEAVED-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-INTERLEAVED-NEXT: ret void +; +; CHECK-MAXBW-LABEL: define void @not_fdotp_not_phi2( +; CHECK-MAXBW-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label %[[FOR_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-MAXBW: [[FOR_PREHEADER]]: +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr null, align 1 +; 
CHECK-MAXBW-NEXT: [[LOAD_A1:%.*]] = load half, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-MAXBW-NEXT: [[A_EXT:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[A_EXT1:%.*]] = fpext half [[LOAD_A1]] to float +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A_EXT]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[A_EXT1]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP37:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 64 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = load half, ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = load half, ptr [[TMP9]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = load half, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = load half, ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = insertelement <4 x half> poison, half [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = insertelement <4 x half> [[TMP20]], half [[TMP17]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = insertelement <4 x half> [[TMP21]], half [[TMP18]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = 
insertelement <4 x half> [[TMP22]], half [[TMP19]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = fpext <4 x half> [[TMP23]] to <4 x float> +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP24]] +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = fadd reassoc <4 x float> [[TMP25]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = load half, ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = load half, ptr [[TMP13]], align 1 +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = load half, ptr [[TMP14]], align 1 +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = load half, ptr [[TMP15]], align 1 +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = insertelement <4 x half> poison, half [[TMP27]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = insertelement <4 x half> [[TMP31]], half [[TMP28]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = insertelement <4 x half> [[TMP32]], half [[TMP29]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = insertelement <4 x half> [[TMP33]], half [[TMP30]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = fpext <4 x half> [[TMP34]] to <4 x float> +; CHECK-MAXBW-NEXT: [[TMP36:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP35]] +; CHECK-MAXBW-NEXT: [[TMP37]] = fadd reassoc <4 x float> [[TMP36]], [[TMP26]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP37]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[MATRIX]], %[[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP39]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[ADD_1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[PTR]], i64 1 +; CHECK-MAXBW-NEXT: [[GEP_B1:%.*]] = getelementptr half, ptr [[PTR]], i64 2 +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[B_EXT:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[A_EXT]], [[B_EXT]] +; CHECK-MAXBW-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[LOAD_B1:%.*]] = load half, ptr [[GEP_B1]], align 1 +; CHECK-MAXBW-NEXT: [[B_EXT1:%.*]] = fpext half [[LOAD_B1]] to float +; CHECK-MAXBW-NEXT: [[MUL_1:%.*]] = fmul float [[A_EXT1]], [[B_EXT1]] +; CHECK-MAXBW-NEXT: [[ADD_1]] = fadd reassoc float [[MUL_1]], [[ADD]] +; CHECK-MAXBW-NEXT: [[SCEVGEP]] = getelementptr half, ptr [[PTR]], i64 16 +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; 
CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_FLOAT:%.*]] = phi float [ [[ADD_1]], %[[FOR_BODY]] ], [ [[TMP39]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: br label %[[EXIT]] +; CHECK-MAXBW: [[EXIT]]: +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_FLOAT]], %[[FOR_EXIT]] ] +; CHECK-MAXBW-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-MAXBW-NEXT: ret void +; +entry: + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %for.preheader, label %exit + +for.preheader: ; preds = %entry + %load.a = load half, ptr inttoptr (i64 0 to ptr), align 1 + %load.a1 = load half, ptr inttoptr (i64 1 to ptr), align 1 + %a.ext = fpext half %load.a to float + %a.ext1 = fpext half %load.a1 to float + br label %for.body + +for.body: ; preds = %for.preheader, %for.body + %iv = phi i32 [ %iv.next, %for.body ], [ 0, %for.preheader ] + %ptr = phi ptr [ %scevgep, %for.body ], [ %matrix, %for.preheader ] + %accum = phi float [ %add.1, %for.body ], [ 0.0, %for.preheader ] + %gep.b = getelementptr half, ptr %ptr, i64 1 + %gep.b1 = getelementptr half, ptr %ptr, i64 2 + %load.b = load half, ptr %gep.b, align 1 + %b.ext = fpext half %load.b to float + %mul = fmul float %a.ext, %b.ext + %add = fadd reassoc float %mul, %accum + %load.b1 = load half, ptr %gep.b1, align 1 + %b.ext1 = fpext half %load.b1 to float + %mul.1 = fmul float %a.ext1, %b.ext1 + %add.1 = fadd reassoc float %mul.1, %add + %scevgep = getelementptr half, ptr %ptr, i64 16 + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv.next, %n + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + %add.float = phi float [ %add.1, %for.body ] + br label %exit + +exit: ; preds = %for.exit, %entry + %result = phi float [ 0.000000e+00, %entry ], [ %add.float, %for.exit ] + store float %result, ptr %matrix, align 4 + ret void +} + +define float @not_fdotp_ext_outside_plan(ptr %a, half %b, i64 %n) #0 { +; CHECK-INTERLEAVE1-LABEL: define float @not_fdotp_ext_outside_plan( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], half [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[FOR_PH:.*]] +; CHECK-INTERLEAVE1: [[FOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = fpext half [[B]] to float +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[EXT_B]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr inbounds half, ptr [[A]], i64 
[[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[TMP1]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3]] = fadd reassoc <8 x float> [[TMP2]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVE1: [[SCALAR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[FOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = fmul float [[EXT_A]], [[EXT_B]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVE1: [[EXIT_LOOPEXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVE1: [[EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-INTERLEAVE1-NEXT: ret float [[RESULT]] +; +; CHECK-INTERLEAVED-LABEL: define float @not_fdotp_ext_outside_plan( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], half [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[FOR_PH:.*]] +; CHECK-INTERLEAVED: [[FOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = fpext half [[B]] to float +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[EXT_B]], i64 0 
+; CHECK-INTERLEAVED-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer
+; CHECK-INTERLEAVED-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-INTERLEAVED:       [[VECTOR_BODY]]:
+; CHECK-INTERLEAVED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[TMP0:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP1:%.*]] = getelementptr inbounds half, ptr [[TMP0]], i32 8
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float>
+; CHECK-INTERLEAVED-NEXT:    [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float>
+; CHECK-INTERLEAVED-NEXT:    [[TMP4:%.*]] = fmul <8 x float> [[TMP2]], [[BROADCAST_SPLAT]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP5:%.*]] = fmul <8 x float> [[TMP3]], [[BROADCAST_SPLAT]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP6]] = fadd reassoc <8 x float> [[TMP4]], [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP7]] = fadd reassoc <8 x float> [[TMP5]], [[VEC_PHI1]]
+; CHECK-INTERLEAVED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVED-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVED:       [[MIDDLE_BLOCK]]:
+; CHECK-INTERLEAVED-NEXT:    [[BIN_RDX:%.*]] = fadd reassoc <8 x float> [[TMP7]], [[TMP6]]
+; CHECK-INTERLEAVED-NEXT:    [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-INTERLEAVED:       [[SCALAR_PH]]:
+; CHECK-INTERLEAVED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PH]] ]
+; CHECK-INTERLEAVED-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-INTERLEAVED:       [[FOR_BODY]]:
+; CHECK-INTERLEAVED-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]]
+; CHECK-INTERLEAVED-NEXT:    [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2
+; CHECK-INTERLEAVED-NEXT:    [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float
+; CHECK-INTERLEAVED-NEXT:    [[MUL:%.*]] = fmul float [[EXT_A]], [[EXT_B]]
+; CHECK-INTERLEAVED-NEXT:    [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]]
+; CHECK-INTERLEAVED-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-INTERLEAVED-NEXT:    [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-INTERLEAVED-NEXT:    br i1 [[CMP_1]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-INTERLEAVED:       [[EXIT_LOOPEXIT]]:
+; CHECK-INTERLEAVED-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; CHECK-INTERLEAVED-NEXT:    br label %[[EXIT]]
+; CHECK-INTERLEAVED:       [[EXIT]]:
+; CHECK-INTERLEAVED-NEXT:    [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-INTERLEAVED-NEXT:    ret float [[RESULT]]
+;
+; CHECK-MAXBW-LABEL: define float @not_fdotp_ext_outside_plan(
+; CHECK-MAXBW-SAME: ptr [[A:%.*]], half [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT:  [[ENTRY:.*]]:
+; CHECK-MAXBW-NEXT:    [[CMP:%.*]] = icmp eq i64 [[N]], 0
+; CHECK-MAXBW-NEXT:    br i1 [[CMP]], label %[[EXIT:.*]], label %[[FOR_PH:.*]]
+; CHECK-MAXBW:       [[FOR_PH]]:
+; CHECK-MAXBW-NEXT:    [[EXT_B:%.*]] = fpext half [[B]] to float
+; CHECK-MAXBW-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3
+; CHECK-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-MAXBW:       [[VECTOR_PH]]:
+; CHECK-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-MAXBW-NEXT:    [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
+; CHECK-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; CHECK-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-MAXBW-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x float> poison, float [[EXT_B]], i64 0
+; CHECK-MAXBW-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x float> [[BROADCAST_SPLATINSERT]], <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer
+; CHECK-MAXBW-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-MAXBW:       [[VECTOR_BODY]]:
+; CHECK-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> splat (float -0.000000e+00), float 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[TMP4:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]]
+; CHECK-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP4]], align 2
+; CHECK-MAXBW-NEXT:    [[TMP5:%.*]] = fpext <vscale x 8 x half> [[WIDE_LOAD]] to <vscale x 8 x float>
+; CHECK-MAXBW-NEXT:    [[TMP6:%.*]] = fmul <vscale x 8 x float> [[TMP5]], [[BROADCAST_SPLAT]]
+; CHECK-MAXBW-NEXT:    [[TMP7]] = fadd reassoc <vscale x 8 x float> [[TMP6]], [[VEC_PHI]]
+; CHECK-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-MAXBW-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-MAXBW-NEXT:    br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-MAXBW:       [[MIDDLE_BLOCK]]:
+; CHECK-MAXBW-NEXT:    [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[TMP7]])
+; CHECK-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-MAXBW-NEXT:    br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-MAXBW:       [[SCALAR_PH]]:
+; CHECK-MAXBW-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PH]] ]
+; CHECK-MAXBW-NEXT:    [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PH]] ]
+; CHECK-MAXBW-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK-MAXBW:       [[FOR_BODY]]:
+; CHECK-MAXBW-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]]
+; CHECK-MAXBW-NEXT:    [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2
+; CHECK-MAXBW-NEXT:    [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float
+; CHECK-MAXBW-NEXT:    [[MUL:%.*]] = fmul float [[EXT_A]], [[EXT_B]]
+; CHECK-MAXBW-NEXT:    [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]]
+; CHECK-MAXBW-NEXT:    [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
+; CHECK-MAXBW-NEXT:    [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
+; CHECK-MAXBW-NEXT:    br i1 [[CMP_1]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-MAXBW:       [[EXIT_LOOPEXIT]]:
+; CHECK-MAXBW-NEXT:    [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ]
+; CHECK-MAXBW-NEXT:    br label %[[EXIT]]
+; CHECK-MAXBW:       [[EXIT]]:
+; CHECK-MAXBW-NEXT:    [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[EXIT_LOOPEXIT]] ]
+; CHECK-MAXBW-NEXT:    ret float [[RESULT]]
+;
+entry:
+  %cmp = icmp eq i64 %n, 0
+  br i1 %cmp, label %exit, label %for.ph
+
+for.ph:                                           ; preds = %entry
+  %ext.b = fpext half %b to float
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %iv = phi i64 [ 0, %for.ph ], [ %iv.next, %for.body ]
+  %accum = phi float [ 0.0, %for.ph ], [ %add, %for.body ]
+  %gep.a = getelementptr inbounds half, ptr %a, i64 %iv
+  %load.a = load half, ptr %gep.a, align 2
+  %ext.a = fpext half %load.a to float
+  %mul = fmul float %ext.a, %ext.b
+  %add = fadd reassoc float %mul, %accum
+  %iv.next = add nuw nsw i64 %iv, 1
+  %cmp.1 = icmp eq i64 %iv.next, %n
+  br i1 %cmp.1, label %exit, label %for.body
+
+exit:                                             ; preds = %for.cond.cleanup.loopexit, %entry
+  %result = phi float [ 0.0, %entry ], [ %add, %for.body ]
+  ret float %result
+}
+
 ;.
 ; CHECK-INTERLEAVE1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK-INTERLEAVE1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK-INTERLEAVE1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-INTERLEAVE1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVE1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVE1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK-INTERLEAVE1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVE1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK-INTERLEAVE1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVE1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
 ;.
 ; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK-INTERLEAVED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK-INTERLEAVED: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK-INTERLEAVED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK-INTERLEAVED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-INTERLEAVED: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
 ;.
 ; CHECK-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; CHECK-MAXBW: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK-MAXBW: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+; CHECK-MAXBW: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]}
+; CHECK-MAXBW: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]}
+; CHECK-MAXBW: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
+; CHECK-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]}
+; CHECK-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]}
+; CHECK-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]}
 ;.