diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 033910121a54f..67a03478b85ef 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -20363,6 +20363,77 @@ Arguments:
 """"""""""
 
 The argument to this intrinsic must be a vector of floating-point values.
 
+Vector Partial Reduction Intrinsics
+-----------------------------------
+
+Partial reductions of vectors can be expressed using the intrinsics described in
+this section. Each one reduces the concatenation of the two vector arguments
+down to the number of elements of the result vector type.
+
+Other than the reduction operator (e.g. add, fadd), the way in which the
+concatenated arguments are reduced is entirely unspecified. By their nature these
+intrinsics are not expected to be useful in isolation but can instead be used to
+implement the first phase of an overall reduction operation.
+
+The typical use case is loop vectorization, where reductions are split into an
+in-loop phase, where maintaining an unordered vector result is important for
+performance, and an out-of-loop phase, which is required to calculate the final
+scalar result.
+
+By avoiding the introduction of new ordering constraints, these intrinsics
+enhance the ability to leverage a target's accumulation instructions.
+
+'``llvm.vector.partial.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b)
+      declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b)
+      declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b)
+      declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b)
+
+Arguments:
+""""""""""
+
+The first argument is an integer vector with the same type as the result.
+
+The second argument is a vector with a length that is a known integer multiple
+of the result's length, while maintaining the same element type.
+
+'``llvm.vector.partial.reduce.fadd.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> %a, <8 x float> %b)
+      declare <vscale x 4 x float> @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32(<vscale x 4 x float> %a, <vscale x 8 x float> %b)
+
+Arguments:
+""""""""""
+
+The first argument is a floating-point vector with the same type as the result.
+
+The second argument is a vector with a length that is a known integer multiple
+of the result's length, while maintaining the same element type.
+
+Semantics:
+""""""""""
+
+As the way in which the arguments to this floating-point intrinsic are reduced
+is unspecified, this intrinsic assumes that floating-point reassociation and
+contraction can be leveraged to implement the reduction. This may result in
+variations in the result due to reordering, or due to lowering to different
+instructions (including combining multiple instructions into a single one).
+
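A minimal sketch of the intended usage pattern, assuming a vectorized loop that
accumulates eight ``float`` products per iteration into a four-element
accumulator (the value names below are illustrative, not taken from the patch):

::

      ; In-loop phase: fold eight products into the four running partial sums.
      ; How the products are distributed across the four lanes is unspecified.
      %mul      = fmul <8 x float> %a.wide, %b.wide
      %acc.next = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> %acc, <8 x float> %mul)

      ; Out-of-loop phase: reduce the accumulator to the final scalar result.
      %sum = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %acc.next)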
 '``llvm.vector.insert``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -20736,50 +20807,6 @@ Note that it has the following implications:
 
 - If ``%cnt`` is non-zero, the return value is non-zero as well.
 - If ``%cnt`` is less than or equal to ``%max_lanes``, the return value is equal to ``%cnt``.
 
-'``llvm.vector.partial.reduce.add.*``' Intrinsic
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Syntax:
-"""""""
-This is an overloaded intrinsic.
-
-::
-
-      declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v8i32(<4 x i32> %a, <8 x i32> %b)
-      declare <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v4i32.v16i32(<4 x i32> %a, <16 x i32> %b)
-      declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv8i32(<vscale x 4 x i32> %a, <vscale x 8 x i32> %b)
-      declare <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv4i32.nxv16i32(<vscale x 4 x i32> %a, <vscale x 16 x i32> %b)
-
-Overview:
-"""""""""
-
-The '``llvm.vector.partial.reduce.add.*``' intrinsics reduce the
-concatenation of the two vector arguments down to the number of elements of the
-result vector type.
-
-Arguments:
-""""""""""
-
-The first argument is an integer vector with the same type as the result.
-
-The second argument is a vector with a length that is a known integer multiple
-of the result's type, while maintaining the same element type.
-
-Semantics:
-""""""""""
-
-Other than the reduction operator (e.g. add) the way in which the concatenated
-arguments is reduced is entirely unspecified. By their nature these intrinsics
-are not expected to be useful in isolation but instead implement the first phase
-of an overall reduction operation.
-
-The typical use case is loop vectorization where reductions are split into an
-in-loop phase, where maintaining an unordered vector result is important for
-performance, and an out-of-loop phase to calculate the final scalar result.
-
-By avoiding the introduction of new ordering constraints, these intrinsics
-enhance the ability to leverage a target's accumulation instructions.
-
 '``llvm.experimental.vector.histogram.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5d3b233ed6b6a..08b86f02c48f7 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -222,7 +222,12 @@ class TargetTransformInfoImplBase;
 /// for IR-level transformations.
 class TargetTransformInfo {
 public:
-  enum PartialReductionExtendKind { PR_None, PR_SignExtend, PR_ZeroExtend };
+  enum PartialReductionExtendKind {
+    PR_None,
+    PR_SignExtend,
+    PR_ZeroExtend,
+    PR_FPExtend
+  };
 
   /// Get the kind of extension that an instruction represents.
   LLVM_ABI static PartialReductionExtendKind
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index ff3dd0d4c3c51..1a3fd27e64c4f 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -1516,6 +1516,7 @@ enum NodeType {
   PARTIAL_REDUCE_SMLA,  // sext, sext
   PARTIAL_REDUCE_UMLA,  // zext, zext
   PARTIAL_REDUCE_SUMLA, // sext, zext
+  PARTIAL_REDUCE_FMLA,  // fpext, fpext
 
   // The `llvm.experimental.stackmap` intrinsic.
   // Operands: input chain, glue, <id>, <numShadowBytes>, [live0[, live1...]]
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 69713d0d84011..ed5c7572ab592 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1938,6 +1938,10 @@ LLVM_ABI bool isNullOrNullSplat(SDValue V, bool AllowUndefs = false);
 /// be zero.
 LLVM_ABI bool isOneOrOneSplat(SDValue V, bool AllowUndefs = false);
 
+/// Return true if the value is a constant floating-point value, or a splatted
+/// vector of a constant floating-point value, of 1.0 (with no undefs).
+LLVM_ABI bool isOneOrOneSplatFP(SDValue V, bool AllowUndefs = false); + /// Return true if the value is a constant -1 integer or a splatted vector of a /// constant -1 integer (with no undefs). /// Does not permit build vector implicit truncation. diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 73f2c55a71125..92405bebbca46 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1664,7 +1664,7 @@ class LLVM_ABI TargetLoweringBase { LegalizeAction getPartialReduceMLAAction(unsigned Opc, EVT AccVT, EVT InputVT) const { assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA || - Opc == ISD::PARTIAL_REDUCE_SUMLA); + Opc == ISD::PARTIAL_REDUCE_SUMLA || Opc == ISD::PARTIAL_REDUCE_FMLA); PartialReduceActionTypes Key = {Opc, AccVT.getSimpleVT().SimpleTy, InputVT.getSimpleVT().SimpleTy}; auto It = PartialReduceMLAActions.find(Key); @@ -2766,7 +2766,7 @@ class LLVM_ABI TargetLoweringBase { void setPartialReduceMLAAction(unsigned Opc, MVT AccVT, MVT InputVT, LegalizeAction Action) { assert(Opc == ISD::PARTIAL_REDUCE_SMLA || Opc == ISD::PARTIAL_REDUCE_UMLA || - Opc == ISD::PARTIAL_REDUCE_SUMLA); + Opc == ISD::PARTIAL_REDUCE_SUMLA || Opc == ISD::PARTIAL_REDUCE_FMLA); assert(AccVT.isValid() && InputVT.isValid() && "setPartialReduceMLAAction types aren't valid"); PartialReduceActionTypes Key = {Opc, AccVT.SimpleTy, InputVT.SimpleTy}; diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 12d1c2528f977..5163d2052561e 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2799,6 +2799,10 @@ def int_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]>; +def int_vector_partial_reduce_fadd : DefaultAttrsIntrinsic<[LLVMMatchType<0>], + [llvm_anyfloat_ty, llvm_anyfloat_ty], + [IntrNoMem]>; + //===----------------- Pointer Authentication Intrinsics ------------------===// // diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 07a858fd682fc..a9750a5ab03f9 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -527,6 +527,8 @@ def partial_reduce_smla : SDNode<"ISD::PARTIAL_REDUCE_SMLA", SDTPartialReduceMLA>; def partial_reduce_sumla : SDNode<"ISD::PARTIAL_REDUCE_SUMLA", SDTPartialReduceMLA>; +def partial_reduce_fmla : SDNode<"ISD::PARTIAL_REDUCE_FMLA", + SDTPartialReduceMLA>; def fadd : SDNode<"ISD::FADD" , SDTFPBinOp, [SDNPCommutative]>; def fsub : SDNode<"ISD::FSUB" , SDTFPBinOp>; diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index bf62623099a97..b3e677d7d8731 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1005,6 +1005,8 @@ TargetTransformInfo::getPartialReductionExtendKind(Instruction *I) { return PR_SignExtend; if (isa(I)) return PR_ZeroExtend; + if (isa(I)) + return PR_FPExtend; return PR_None; } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 310d35d9b1d1e..bf17e67276df2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2042,6 +2042,7 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case 
ISD::PARTIAL_REDUCE_FMLA: return visitPARTIAL_REDUCE_MLA(N); case ISD::VECTOR_COMPRESS: return visitVECTOR_COMPRESS(N); case ISD::LIFETIME_END: return visitLIFETIME_END(N); @@ -12988,6 +12989,9 @@ SDValue DAGCombiner::visitPARTIAL_REDUCE_MLA(SDNode *N) { // // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) +// +// partial_reduce_fmla(acc, fmul(fpext(a), fpext(b)), splat(1.0)) +// -> partial_reduce_fmla(acc, a, b) SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDLoc DL(N); auto *Context = DAG.getContext(); @@ -12996,7 +13000,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { SDValue Op2 = N->getOperand(2); unsigned Opc = Op1->getOpcode(); - if (Opc != ISD::MUL && Opc != ISD::SHL) + if (Opc != ISD::MUL && Opc != ISD::FMUL && Opc != ISD::SHL) return SDValue(); SDValue LHS = Op1->getOperand(0); @@ -13015,13 +13019,16 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { Opc = ISD::MUL; } - APInt C; - if (Opc != ISD::MUL || !ISD::isConstantSplatVector(Op2.getNode(), C) || - !C.isOne()) + if (!(Opc == ISD::MUL && llvm::isOneOrOneSplat(Op2)) && + !(Opc == ISD::FMUL && llvm::isOneOrOneSplatFP(Op2))) return SDValue(); + auto IsIntOrFPExtOpcode = [](unsigned int Opcode) { + return (ISD::isExtOpcode(Opcode) || Opcode == ISD::FP_EXTEND); + }; + unsigned LHSOpcode = LHS->getOpcode(); - if (!ISD::isExtOpcode(LHSOpcode)) + if (!IsIntOrFPExtOpcode(LHSOpcode)) return SDValue(); SDValue LHSExtOp = LHS->getOperand(0); @@ -13029,6 +13036,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // partial_reduce_*mla(acc, mul(ext(x), splat(C)), splat(1)) // -> partial_reduce_*mla(acc, x, C) + APInt C; if (ISD::isConstantSplatVector(RHS.getNode(), C)) { // TODO: Make use of partial_reduce_sumla here APInt CTrunc = C.trunc(LHSExtOpVT.getScalarSizeInBits()); @@ -13053,7 +13061,7 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { } unsigned RHSOpcode = RHS->getOpcode(); - if (!ISD::isExtOpcode(RHSOpcode)) + if (!IsIntOrFPExtOpcode(RHSOpcode)) return SDValue(); SDValue RHSExtOp = RHS->getOperand(0); @@ -13070,6 +13078,8 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { else if (LHSOpcode == ISD::ZERO_EXTEND && RHSOpcode == ISD::SIGN_EXTEND) { NewOpc = ISD::PARTIAL_REDUCE_SUMLA; std::swap(LHSExtOp, RHSExtOp); + } else if (LHSOpcode == ISD::FP_EXTEND && RHSOpcode == ISD::FP_EXTEND) { + NewOpc = ISD::PARTIAL_REDUCE_FMLA; } else return SDValue(); // For a 2-stage extend the signedness of both of the extends must match @@ -13097,30 +13107,33 @@ SDValue DAGCombiner::foldPartialReduceMLAMulOp(SDNode *N) { // -> partial.reduce.smla(acc, op, splat(trunc(1))) // partial.reduce.sumla(acc, sext(op), splat(1)) // -> partial.reduce.smla(acc, op, splat(trunc(1))) +// partial.reduce.fmla(acc, fpext(op), splat(1.0)) +// -> partial.reduce.fmla(acc, op, splat(1.0)) SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { SDLoc DL(N); SDValue Acc = N->getOperand(0); SDValue Op1 = N->getOperand(1); SDValue Op2 = N->getOperand(2); - APInt ConstantOne; - if (!ISD::isConstantSplatVector(Op2.getNode(), ConstantOne) || - !ConstantOne.isOne()) + if (!llvm::isOneOrOneSplat(Op2) && !llvm::isOneOrOneSplatFP(Op2)) return SDValue(); unsigned Op1Opcode = Op1.getOpcode(); - if (!ISD::isExtOpcode(Op1Opcode)) + if (!ISD::isExtOpcode(Op1Opcode) && Op1Opcode != ISD::FP_EXTEND) return SDValue(); - bool Op1IsSigned = Op1Opcode == ISD::SIGN_EXTEND; + bool Op1IsSigned = + Op1Opcode == ISD::SIGN_EXTEND || Op1Opcode == ISD::FP_EXTEND; bool 
NodeIsSigned = N->getOpcode() != ISD::PARTIAL_REDUCE_UMLA; EVT AccElemVT = Acc.getValueType().getVectorElementType(); if (Op1IsSigned != NodeIsSigned && Op1.getValueType().getVectorElementType() != AccElemVT) return SDValue(); - unsigned NewOpcode = - Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA : ISD::PARTIAL_REDUCE_UMLA; + unsigned NewOpcode = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? ISD::PARTIAL_REDUCE_FMLA + : Op1IsSigned ? ISD::PARTIAL_REDUCE_SMLA + : ISD::PARTIAL_REDUCE_UMLA; SDValue UnextOp1 = Op1.getOperand(0); EVT UnextOp1VT = UnextOp1.getValueType(); @@ -13130,8 +13143,12 @@ SDValue DAGCombiner::foldPartialReduceAdd(SDNode *N) { TLI.getTypeToTransformTo(*Context, UnextOp1VT))) return SDValue(); + SDValue Constant = N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA + ? DAG.getConstantFP(1, DL, UnextOp1VT) + : DAG.getConstant(1, DL, UnextOp1VT); + return DAG.getNode(NewOpcode, DL, N->getValueType(0), Acc, UnextOp1, - DAG.getConstant(1, DL, UnextOp1VT)); + Constant); } SDValue DAGCombiner::visitVP_STRIDED_LOAD(SDNode *N) { diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 8e423c4f83b38..94751be5b7986 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -534,6 +534,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Action = TLI.getPartialReduceMLAAction(Op.getOpcode(), Node->getValueType(0), Node->getOperand(1).getValueType()); @@ -1243,6 +1244,7 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Results.push_back(TLI.expandPartialReduceMLA(Node, DAG)); return; case ISD::VECREDUCE_SEQ_FADD: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 3b5f83f7c089a..4b40b32621418 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1459,6 +1459,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: SplitVecRes_PARTIAL_REDUCE_MLA(N, Lo, Hi); break; case ISD::GET_ACTIVE_LANE_MASK: @@ -3674,6 +3675,7 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: Res = SplitVecOp_PARTIAL_REDUCE_MLA(N); break; } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 90edaf3ef5471..6f2a009676442 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -8404,7 +8404,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SMLA: - case ISD::PARTIAL_REDUCE_SUMLA: { + case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: { [[maybe_unused]] EVT AccVT = N1.getValueType(); [[maybe_unused]] EVT Input1VT = N2.getValueType(); [[maybe_unused]] EVT Input2VT = N3.getValueType(); @@ -13054,6 +13055,11 @@ bool llvm::isOneOrOneSplat(SDValue N, bool AllowUndefs) { 
return C && C->isOne(); } +bool llvm::isOneOrOneSplatFP(SDValue N, bool AllowUndefs) { + ConstantFPSDNode *C = isConstOrConstSplatFP(N, AllowUndefs); + return C && C->isExactlyValue(1.0); +} + bool llvm::isAllOnesOrAllOnesSplat(SDValue N, bool AllowUndefs) { N = peekThroughBitcasts(N); unsigned BitWidth = N.getScalarValueSizeInBits(); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 20a0efd3afa1c..3b7dadbf32124 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8081,6 +8081,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, Input, DAG.getConstant(1, sdl, Input.getValueType()))); return; } + case Intrinsic::vector_partial_reduce_fadd: { + SDValue Acc = getValue(I.getOperand(0)); + SDValue Input = getValue(I.getOperand(1)); + setValue(&I, DAG.getNode( + ISD::PARTIAL_REDUCE_FMLA, sdl, Acc.getValueType(), Acc, + Input, DAG.getConstantFP(1.0, sdl, Input.getValueType()))); + return; + } case Intrinsic::experimental_cttz_elts: { auto DL = getCurSDLoc(); SDValue Op = getValue(I.getOperand(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 39cbfad6d0be1..8495e9691c77b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -588,6 +588,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { return "partial_reduce_smla"; case ISD::PARTIAL_REDUCE_SUMLA: return "partial_reduce_sumla"; + case ISD::PARTIAL_REDUCE_FMLA: + return "partial_reduce_fmla"; case ISD::LOOP_DEPENDENCE_WAR_MASK: return "loop_dep_war"; case ISD::LOOP_DEPENDENCE_RAW_MASK: diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 920dff935daed..0d9a133000bd4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -12061,22 +12061,32 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, EVT::getVectorVT(*DAG.getContext(), AccVT.getVectorElementType(), MulOpVT.getVectorElementCount()); - unsigned ExtOpcLHS = N->getOpcode() == ISD::PARTIAL_REDUCE_UMLA - ? ISD::ZERO_EXTEND - : ISD::SIGN_EXTEND; - unsigned ExtOpcRHS = N->getOpcode() == ISD::PARTIAL_REDUCE_SMLA - ? 
ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND; + unsigned ExtOpcLHS, ExtOpcRHS; + switch (N->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode"); + case ISD::PARTIAL_REDUCE_UMLA: + ExtOpcLHS = ExtOpcRHS = ISD::ZERO_EXTEND; + break; + case ISD::PARTIAL_REDUCE_SMLA: + ExtOpcLHS = ExtOpcRHS = ISD::SIGN_EXTEND; + break; + case ISD::PARTIAL_REDUCE_FMLA: + ExtOpcLHS = ExtOpcRHS = ISD::FP_EXTEND; + break; + } if (ExtMulOpVT != MulOpVT) { MulLHS = DAG.getNode(ExtOpcLHS, DL, ExtMulOpVT, MulLHS); MulRHS = DAG.getNode(ExtOpcRHS, DL, ExtMulOpVT, MulRHS); } SDValue Input = MulLHS; - APInt ConstantOne; - if (!ISD::isConstantSplatVector(MulRHS.getNode(), ConstantOne) || - !ConstantOne.isOne()) + if (N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA) { + if (!llvm::isOneOrOneSplatFP(MulRHS)) + Input = DAG.getNode(ISD::FMUL, DL, ExtMulOpVT, MulLHS, MulRHS); + } else if (!llvm::isOneOrOneSplat(MulRHS)) { Input = DAG.getNode(ISD::MUL, DL, ExtMulOpVT, MulLHS, MulRHS); + } unsigned Stride = AccVT.getVectorMinNumElements(); unsigned ScaleFactor = MulOpVT.getVectorMinNumElements() / Stride; @@ -12086,10 +12096,13 @@ SDValue TargetLowering::expandPartialReduceMLA(SDNode *N, for (unsigned I = 0; I < ScaleFactor; I++) Subvectors.push_back(DAG.getExtractSubvector(DL, AccVT, Input, I * Stride)); + unsigned FlatNode = + N->getOpcode() == ISD::PARTIAL_REDUCE_FMLA ? ISD::FADD : ISD::ADD; + // Flatten the subvector tree while (Subvectors.size() > 1) { Subvectors.push_back( - DAG.getNode(ISD::ADD, DL, AccVT, {Subvectors[0], Subvectors[1]})); + DAG.getNode(FlatNode, DL, AccVT, {Subvectors[0], Subvectors[1]})); Subvectors.pop_front(); Subvectors.pop_front(); } diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 03da1547b652f..7efba21238458 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6578,6 +6578,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) { } break; } + case Intrinsic::vector_partial_reduce_fadd: case Intrinsic::vector_partial_reduce_add: { VectorType *AccTy = cast(Call.getArgOperand(0)->getType()); VectorType *VecTy = cast(Call.getArgOperand(1)->getType()); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a81de5c5adc34..abe1af23b4e40 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1921,6 +1921,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setPartialReduceMLAAction(MLAOps, MVT::nxv4i32, MVT::nxv8i16, Legal); setPartialReduceMLAAction(MLAOps, MVT::nxv8i16, MVT::nxv16i8, Legal); } + + // Handle floating-point partial reduction + if (Subtarget->hasSVE2p1() || Subtarget->hasSME2()) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, MVT::nxv4f32, + MVT::nxv8f16, Legal); + } } // Handle non-aliasing elements mask @@ -2288,6 +2294,11 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) { MVT::getVectorVT(MVT::i8, NumElts * 8), Custom); } + if (Subtarget->hasSVE2p1() && VT.getVectorElementType() == MVT::f32) { + setPartialReduceMLAAction(ISD::PARTIAL_REDUCE_FMLA, VT, + MVT::getVectorVT(MVT::f16, NumElts * 2), Custom); + } + // Lower fixed length vector operations to scalable equivalents. 
setOperationAction(ISD::ABDS, VT, Default); setOperationAction(ISD::ABDU, VT, Default); @@ -7911,6 +7922,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::PARTIAL_REDUCE_SMLA: case ISD::PARTIAL_REDUCE_UMLA: case ISD::PARTIAL_REDUCE_SUMLA: + case ISD::PARTIAL_REDUCE_FMLA: return LowerPARTIAL_REDUCE_MLA(Op, DAG); } } diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 98a128e582866..44628eacc83c0 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -375,6 +375,11 @@ def AArch64fclamp : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), node:$Zm) ]>; +def AArch64fdot : PatFrags<(ops node:$Zd, node:$Zn, node:$Zm), + [(int_aarch64_sve_fdot_x2 node:$Zd, node:$Zn, node:$Zm), + (partial_reduce_fmla node:$Zd, node:$Zn, node:$Zm) + ]>; + def SDT_AArch64FCVT : SDTypeProfile<1, 3, [ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>, SDTCisSameAs<0,3> @@ -4251,7 +4256,7 @@ defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>; let Predicates = [HasSVE2p1_or_SME2] in { defm FCLAMP_ZZZ : sve_fp_clamp<"fclamp", AArch64fclamp>; -defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fdot_x2>; +defm FDOT_ZZZ_S : sve_float_dot<0b0, 0b0, ZPR32, ZPR16, "fdot", nxv8f16, AArch64fdot>; defm FDOT_ZZZI_S : sve_float_dot_indexed<0b0, 0b00, ZPR16, ZPR3b16, "fdot", nxv8f16, int_aarch64_sve_fdot_lane_x2>; defm BFMLSLB_ZZZ_S : sve2_fp_mla_long<0b110, "bfmlslb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlslb>; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index e3370d31a0e39..497d4fb4d3392 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5640,7 +5640,8 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( (!ST->isNeonAvailable() || !ST->hasDotProd())) return Invalid; - if ((Opcode != Instruction::Add && Opcode != Instruction::Sub) || + if ((Opcode != Instruction::Add && Opcode != Instruction::Sub && + Opcode != Instruction::FAdd) || OpAExtend == TTI::PR_None) return Invalid; @@ -5650,7 +5651,8 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( // We only support multiply binary operations for now, and for muls we // require the types being extended to be the same. 
- if (BinOp && (*BinOp != Instruction::Mul || InputTypeA != InputTypeB)) + if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) || + InputTypeA != InputTypeB)) return Invalid; bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index adf27bed3d749..129083eb0eb27 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7967,7 +7967,8 @@ bool VPRecipeBuilder::getScaledReductions( continue; } Value *ExtOp; - if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp)))) + if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))) && + !match(OpI, m_FPExt(m_Value(ExtOp)))) return false; Exts[I] = cast(OpI); @@ -8138,6 +8139,8 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction, return nullptr; unsigned ReductionOpcode = Reduction->getOpcode(); + if (ReductionOpcode == Instruction::FAdd && !Reduction->hasAllowReassoc()) + return nullptr; if (ReductionOpcode == Instruction::Sub) { auto *const Zero = ConstantInt::get(Reduction->getType(), 0); SmallVector Ops; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 1f1b42bb9c19f..cd08f95254de5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -322,6 +322,8 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, return TTI::PR_ZeroExtend; if (WidenCastR->getOpcode() == Instruction::CastOps::SExt) return TTI::PR_SignExtend; + if (WidenCastR->getOpcode() == Instruction::CastOps::FPExt) + return TTI::PR_FPExtend; return TTI::PR_None; }; @@ -374,8 +376,9 @@ VPPartialReductionRecipe::computeCost(ElementCount VF, void VPPartialReductionRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; - assert(getOpcode() == Instruction::Add && - "Unhandled partial reduction opcode"); + assert( + (getOpcode() == Instruction::Add || getOpcode() == Instruction::FAdd) && + "Unhandled partial reduction opcode"); Value *BinOpVal = State.get(getOperand(1)); Value *PhiVal = State.get(getOperand(0)); @@ -383,9 +386,20 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) { Type *RetTy = PhiVal->getType(); - CallInst *V = - Builder.CreateIntrinsic(RetTy, Intrinsic::vector_partial_reduce_add, - {PhiVal, BinOpVal}, nullptr, "partial.reduce"); + enum llvm::Intrinsic::IndependentIntrinsics PRIntrinsic; + switch (getOpcode()) { + case Instruction::Add: { + PRIntrinsic = Intrinsic::vector_partial_reduce_add; + break; + } + case Instruction::FAdd: { + PRIntrinsic = Intrinsic::vector_partial_reduce_fadd; + break; + } + } + + CallInst *V = Builder.CreateIntrinsic(RetTy, PRIntrinsic, {PhiVal, BinOpVal}, + nullptr, "partial.reduce"); State.set(this, V); } diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll new file mode 100644 index 0000000000000..c055940768dfa --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll @@ -0,0 +1,91 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 +; RUN: llc 
-global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 + +define @fdot_wide_nxv4f32( %acc, %a, %b) { +; SVE2-LABEL: fdot_wide_nxv4f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: uunpklo z3.s, z1.h +; SVE2-NEXT: uunpklo z4.s, z2.h +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: uunpkhi z1.s, z1.h +; SVE2-NEXT: uunpkhi z2.s, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fcvt z4.s, p0/m, z4.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fmul z3.s, z3.s, z4.s +; SVE2-NEXT: fmul z1.s, z1.s, z2.s +; SVE2-NEXT: fadd z0.s, z0.s, z3.s +; SVE2-NEXT: fadd z0.s, z0.s, z1.s +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_nxv4f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: ret +entry: + %a.wide = fpext %a to + %b.wide = fpext %b to + %mult = fmul %a.wide, %b.wide + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %mult) + ret %partial.reduce +} + +define @fdot_splat_nxv4f32( %acc, %a) { +; SVE2-LABEL: fdot_splat_nxv4f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: uunpklo z2.s, z1.h +; SVE2-NEXT: ptrue p0.s +; SVE2-NEXT: uunpkhi z1.s, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fadd z0.s, z0.s, z2.s +; SVE2-NEXT: fadd z0.s, z0.s, z1.s +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_splat_nxv4f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: fmov z2.h, #1.00000000 +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: ret +entry: + %a.wide = fpext %a to + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a.wide) + ret %partial.reduce +} + +define @partial_reduce_nxv8f16( %acc, %a) { +; CHECK-LABEL: partial_reduce_nxv8f16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd z0.h, z0.h, z1.h +; CHECK-NEXT: fadd z0.h, z0.h, z2.h +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + ret %partial.reduce +} + +define @partial_reduce_nxv4f32( %acc, %a) { +; CHECK-LABEL: partial_reduce_nxv4f32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd z0.s, z0.s, z1.s +; CHECK-NEXT: fadd z0.s, z0.s, z2.s +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + ret %partial.reduce +} + +define @partial_reduce_nxv2f64( %acc, %a) { +; CHECK-LABEL: partial_reduce_nxv2f64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd z0.d, z0.d, z1.d +; CHECK-NEXT: fadd z0.d, z0.d, z2.d +; CHECK-NEXT: ret +entry: + %partial.reduce = call @llvm.vector.partial.reduce.fadd( %acc, %a) + ret %partial.reduce +} diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll new file mode 100644 index 0000000000000..b07b571413881 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll @@ -0,0 +1,230 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=aarch64-linux-gnu -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1 + +define void @fdot_wide_v8f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,0) { +; 
SVE2-LABEL: fdot_wide_v8f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl8 +; SVE2-NEXT: mov x8, #8 // =0x8 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v8f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl8 +; SVE2P1-NEXT: ptrue p1.h, vl16 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <8 x float>, ptr %accptr + %a = load <16 x half>, ptr %aptr + %b = load <16 x half>, ptr %bptr + %a.wide = fpext <16 x half> %a to <16 x float> + %b.wide = fpext <16 x half> %b to <16 x float> + %mult = fmul <16 x float> %a.wide, %b.wide + %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult) + store <8 x float> %partial.reduce, ptr %accptr + ret void +} + +define void @fdot_wide_v16f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(4,0) { +; SVE2-LABEL: fdot_wide_v16f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl16 +; SVE2-NEXT: mov x8, #16 // =0x10 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v16f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl16 +; SVE2P1-NEXT: ptrue p1.h, vl32 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <16 x float>, ptr %accptr + %a = load <32 x half>, ptr %aptr + %b = load <32 x half>, ptr %bptr + %a.wide = fpext <32 x half> %a to <32 x float> + %b.wide = fpext <32 x half> %b to <32 x float> + %mult = fmul <32 x float> %a.wide, %b.wide + %partial.reduce = call <16 x float> @llvm.vector.partial.reduce.fadd(<16 x float> %acc, <32 x float> %mult) + store <16 x float> %partial.reduce, ptr %accptr + ret void +} + +define void @fdot_wide_v32f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(8,0) { +; SVE2-LABEL: fdot_wide_v32f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl32 +; SVE2-NEXT: mov x8, #32 // =0x20 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: 
fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v32f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl32 +; SVE2P1-NEXT: ptrue p1.h, vl64 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <32 x float>, ptr %accptr + %a = load <64 x half>, ptr %aptr + %b = load <64 x half>, ptr %bptr + %a.wide = fpext <64 x half> %a to <64 x float> + %b.wide = fpext <64 x half> %b to <64 x float> + %mult = fmul <64 x float> %a.wide, %b.wide + %partial.reduce = call <32 x float> @llvm.vector.partial.reduce.fadd(<32 x float> %acc, <64 x float> %mult) + store <32 x float> %partial.reduce, ptr %accptr + ret void +} + +define void @fdot_wide_v64f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(16,0) { +; SVE2-LABEL: fdot_wide_v64f32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: ptrue p0.s, vl64 +; SVE2-NEXT: mov x8, #64 // =0x40 +; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1] +; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2] +; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1] +; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1] +; SVE2-NEXT: fcvt z0.s, p0/m, z0.h +; SVE2-NEXT: fcvt z1.s, p0/m, z1.h +; SVE2-NEXT: fcvt z2.s, p0/m, z2.h +; SVE2-NEXT: fcvt z3.s, p0/m, z3.h +; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0] +; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s +; SVE2-NEXT: st1w { z0.s }, p0, [x0] +; SVE2-NEXT: ret +; +; SVE2P1-LABEL: fdot_wide_v64f32: +; SVE2P1: // %bb.0: // %entry +; SVE2P1-NEXT: ptrue p0.s, vl64 +; SVE2P1-NEXT: ptrue p1.h, vl128 +; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0] +; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1] +; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2] +; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h +; SVE2P1-NEXT: st1w { z0.s }, p0, [x0] +; SVE2P1-NEXT: ret +entry: + %acc = load <64 x float>, ptr %accptr + %a = load <128 x half>, ptr %aptr + %b = load <128 x half>, ptr %bptr + %a.wide = fpext <128 x half> %a to <128 x float> + %b.wide = fpext <128 x half> %b to <128 x float> + %mult = fmul <128 x float> %a.wide, %b.wide + %partial.reduce = call <64 x float> @llvm.vector.partial.reduce.fadd(<64 x float> %acc, <128 x float> %mult) + store <64 x float> %partial.reduce, ptr %accptr + ret void +} + +define <4 x float> @fixed_fdot_wide(<4 x float> %acc, <8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: fixed_fdot_wide: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtl v3.4s, v1.4h +; CHECK-NEXT: fcvtl v4.4s, v2.4h +; CHECK-NEXT: fcvtl2 v1.4s, v1.8h +; CHECK-NEXT: fcvtl2 v2.4s, v2.8h +; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s +; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret +entry: + %a.wide = fpext <8 x half> %a to <8 x float> + %b.wide = fpext <8 x half> %b to <8 x float> + %mult = fmul <8 x float> %a.wide, %b.wide + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult) + ret <4 x float> %partial.reduce +} + +define <8 x half> 
@partial_reduce_half(<8 x half> %acc, <16 x half> %a) { +; CHECK-LABEL: partial_reduce_half: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h +; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ret +entry: + %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a) + ret <8 x half> %partial.reduce +} + +define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) { +; CHECK-LABEL: partial_reduce_float: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s +; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ret +entry: + %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a) + ret <4 x float> %partial.reduce +} + +define <2 x double> @partial_reduce_double(<2 x double> %acc, <4 x double> %a) { +; CHECK-LABEL: partial_reduce_double: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d +; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d +; CHECK-NEXT: ret +entry: + %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a) + ret <2 x double> %partial.reduce +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll new file mode 100644 index 0000000000000..f748ec2744ee8 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-fdot-product.ll @@ -0,0 +1,1101 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -mattr=+sve2p1,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVE1 +; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+sve2p1,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-INTERLEAVED +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -vectorizer-maximize-bandwidth -enable-epilogue-vectorization=false -mattr=+sve2p1,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-MAXBW + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +define float @fdotp(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define float @fdotp( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x half>, ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD1]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = fmul <8 x float> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call 
<4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP4]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: ret float [[TMP7]] +; +; CHECK-INTERLEAVED-LABEL: define float @fdotp( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[TMP0]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr half, ptr [[TMP4]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x half>, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x half>, ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = fpext <8 x half> [[WIDE_LOAD3]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = fpext <8 x half> [[WIDE_LOAD4]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fmul <8 x float> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = fmul <8 x float> [[TMP7]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI]], <8 x float> [[TMP8]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> [[VEC_PHI1]], <8 x float> [[TMP9]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br 
label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: ret float [[TMP11]] +; +; CHECK-MAXBW-LABEL: define float @fdotp( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP4]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP6]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (float -0.000000e+00), float 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP0]], align 1 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP2]], align 1 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.fadd.nxv4f32.nxv8f32( [[VEC_PHI]], [[TMP8]]) +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP10]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] 
= add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP10]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret float [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi float [ 0.0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 1 + %ext.a = fpext half %load.a to float + %gep.b = getelementptr half, ptr %b, i64 %iv + %load.b = load half, ptr %gep.b, align 1 + %ext.b = fpext half %load.b to float + %mul = fmul float %ext.b, %ext.a + %add = fadd reassoc float %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret float %add +} + +define double @not_fdotp_different_types(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define double @not_fdotp_different_types( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x double> [ , %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x double> +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr float, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = fpext <8 x float> [[WIDE_LOAD1]] to <8 x double> +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = fmul <8 x double> [[TMP3]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = fadd reassoc <8 x double> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 8 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: ret double [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define double @not_fdotp_different_types( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x double> [ , %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x double> [ splat (double -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[GEP_A]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr float, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[GEP_B]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x float>, ptr [[GEP_B]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP5]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = fpext <8 x float> [[WIDE_LOAD3]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = fpext <8 x float> [[WIDE_LOAD4]] to <8 x double> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fmul <8 x double> [[TMP6]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = fmul <8 x double> [[TMP7]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP10]] = fadd reassoc <8 x double> [[TMP8]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP11]] = fadd reassoc <8 x double> [[TMP9]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 16 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <8 x double> [[TMP11]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = call reassoc double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: ret double [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define double @not_fdotp_different_types( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (double -0.000000e+00), double 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; 
CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 2 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP9]] = fadd reassoc [[TMP8]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call reassoc double @llvm.vector.reduce.fadd.nxv8f64(double -0.000000e+00, [[TMP9]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ [[TMP11]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi double [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to double +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr float, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load float, ptr [[GEP_B]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext float [[LOAD_B]] to double +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul double [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc double [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi double [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP11]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret double [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi double [ 0.0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 1 + %ext.a = fpext half %load.a to double + %gep.b = getelementptr float, ptr %b, i64 %iv + %load.b = load float, ptr %gep.b, align 2 + %ext.b = fpext float %load.b to double + %mul = fmul double %ext.b, %ext.a + %add = fadd reassoc double %mul, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret double %add +} + +define float @not_fdotp_not_phi(ptr %a, ptr %b) #0 { +; CHECK-INTERLEAVE1-LABEL: define float @not_fdotp_not_phi( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*]]: +; 
CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = fadd reassoc [[TMP8]], [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1 +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP9]], i32 [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVE1: [[SCALAR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[FOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; 
CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-INTERLEAVED-LABEL: define float @not_fdotp_not_phi( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr half, ptr [[TMP4]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[TMP9]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = fmul [[TMP13]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = fadd reassoc [[TMP14]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nuw i32 [[TMP17]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = sub i32 [[TMP18]], 1 +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP15]], i32 [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVED: [[SCALAR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 
0.000000e+00, %[[ENTRY]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVED: [[FOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[EXT_B]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: ret float [[ADD_LCSSA]] +; +; CHECK-MAXBW-LABEL: define float @not_fdotp_not_phi( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr half, ptr [[B]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 1 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = fpext [[WIDE_LOAD1]] to +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = fmul [[TMP7]], [[TMP5]] +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = fadd reassoc [[TMP8]], [[TMP7]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP11]], 8 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sub i32 [[TMP12]], 1 +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[TMP9]], i32 [[TMP13]] +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] 
= icmp eq i64 1024, [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi float [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[ENTRY]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[B]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[EXT_B]], [[EXT_A]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[EXT_B]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: ret float [[ADD_LCSSA]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi float [ 0.0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 1 + %ext.a = fpext half %load.a to float + %gep.b = getelementptr half, ptr %b, i64 %iv + %load.b = load half, ptr %gep.b, align 1 + %ext.b = fpext half %load.b to float + %mul = fmul float %ext.b, %ext.a + %add = fadd reassoc float %mul, %ext.b + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret float %add +} + +define void @not_fdotp_not_phi2(ptr %matrix, i32 %n) #0 { +; CHECK-INTERLEAVE1-LABEL: define void @not_fdotp_not_phi2( +; CHECK-INTERLEAVE1-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label %[[FOR_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-INTERLEAVE1: [[FOR_PREHEADER]]: +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load half, ptr null, align 1 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A1:%.*]] = load half, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-INTERLEAVE1-NEXT: [[A_EXT:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVE1-NEXT: [[A_EXT1:%.*]] = fpext half [[LOAD_A1]] to float +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: 
[[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A_EXT]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[A_EXT1]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP37:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 64 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP5]] +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP6]] +; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP7]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = load half, ptr [[TMP8]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = load half, ptr [[TMP9]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = load half, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = load half, ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = insertelement <4 x half> poison, half [[TMP16]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = insertelement <4 x half> [[TMP20]], half [[TMP17]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = insertelement <4 x half> [[TMP21]], half [[TMP18]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = insertelement <4 x half> [[TMP22]], half [[TMP19]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = fpext <4 x half> [[TMP23]] to <4 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP24]] +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = fadd reassoc <4 x float> [[TMP25]], [[VEC_PHI]] +; 
CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = load half, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = load half, ptr [[TMP13]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = load half, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = load half, ptr [[TMP15]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = insertelement <4 x half> poison, half [[TMP27]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = insertelement <4 x half> [[TMP31]], half [[TMP28]], i32 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP33:%.*]] = insertelement <4 x half> [[TMP32]], half [[TMP29]], i32 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP34:%.*]] = insertelement <4 x half> [[TMP33]], half [[TMP30]], i32 3 +; CHECK-INTERLEAVE1-NEXT: [[TMP35:%.*]] = fpext <4 x half> [[TMP34]] to <4 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP36:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP35]] +; CHECK-INTERLEAVE1-NEXT: [[TMP37]] = fadd reassoc <4 x float> [[TMP36]], [[TMP26]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP39:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP37]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVE1: [[SCALAR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[MATRIX]], %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP39]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[FOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi float [ [[ADD_1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[PTR]], i64 1 +; CHECK-INTERLEAVE1-NEXT: [[GEP_B1:%.*]] = getelementptr half, ptr [[PTR]], i64 2 +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[B_EXT:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = fmul float [[A_EXT]], [[B_EXT]] +; CHECK-INTERLEAVE1-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_B1:%.*]] = load half, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[B_EXT1:%.*]] = fpext half [[LOAD_B1]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL_1:%.*]] = fmul float [[A_EXT1]], [[B_EXT1]] +; CHECK-INTERLEAVE1-NEXT: [[ADD_1]] = fadd reassoc float [[MUL_1]], [[ADD]] +; CHECK-INTERLEAVE1-NEXT: [[SCEVGEP]] = getelementptr half, ptr [[PTR]], i64 16 +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label 
%[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVE1: [[FOR_EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_FLOAT:%.*]] = phi float [ [[ADD_1]], %[[FOR_BODY]] ], [ [[TMP39]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVE1: [[EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_FLOAT]], %[[FOR_EXIT]] ] +; CHECK-INTERLEAVE1-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-INTERLEAVE1-NEXT: ret void +; +; CHECK-INTERLEAVED-LABEL: define void @not_fdotp_not_phi2( +; CHECK-INTERLEAVED-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label %[[FOR_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_PREHEADER]]: +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load half, ptr null, align 1 +; CHECK-INTERLEAVED-NEXT: [[LOAD_A1:%.*]] = load half, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-INTERLEAVED-NEXT: [[A_EXT:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVED-NEXT: [[A_EXT1:%.*]] = fpext half [[LOAD_A1]] to float +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A_EXT]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[A_EXT1]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP70:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP71:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 64 +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 128 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 160 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX]], 192 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 224 +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr 
[[MATRIX]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP8]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP11]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr half, ptr [[NEXT_GEP6]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = getelementptr half, ptr [[NEXT_GEP7]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr half, ptr [[NEXT_GEP8]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr half, ptr [[NEXT_GEP9]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr half, ptr [[NEXT_GEP10]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = getelementptr half, ptr [[NEXT_GEP6]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = getelementptr half, ptr [[NEXT_GEP7]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = getelementptr half, ptr [[NEXT_GEP8]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = getelementptr half, ptr [[NEXT_GEP9]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr half, ptr [[NEXT_GEP10]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = load half, ptr [[TMP12]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = load half, ptr [[TMP13]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = load half, ptr [[TMP14]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = load half, ptr [[TMP15]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = insertelement <4 x half> poison, half [[TMP28]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = insertelement <4 x half> [[TMP32]], half [[TMP29]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = insertelement <4 x half> [[TMP33]], half [[TMP30]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = insertelement <4 x half> [[TMP34]], half [[TMP31]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = load half, ptr [[TMP16]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = load half, ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = load half, ptr [[TMP18]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = load half, ptr [[TMP19]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = insertelement <4 x half> poison, half [[TMP36]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = insertelement <4 x half> [[TMP40]], half [[TMP37]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = insertelement <4 x half> [[TMP41]], half [[TMP38]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = insertelement <4 x half> [[TMP42]], half [[TMP39]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = 
fpext <4 x half> [[TMP35]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = fpext <4 x half> [[TMP43]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP44]] +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP45]] +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = fadd reassoc <4 x float> [[TMP46]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = fadd reassoc <4 x float> [[TMP47]], [[VEC_PHI3]] +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = load half, ptr [[TMP20]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = load half, ptr [[TMP21]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = load half, ptr [[TMP22]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = load half, ptr [[TMP23]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = insertelement <4 x half> poison, half [[TMP50]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = insertelement <4 x half> [[TMP54]], half [[TMP51]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = insertelement <4 x half> [[TMP55]], half [[TMP52]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = insertelement <4 x half> [[TMP56]], half [[TMP53]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = load half, ptr [[TMP24]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = load half, ptr [[TMP25]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = load half, ptr [[TMP26]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = load half, ptr [[TMP27]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = insertelement <4 x half> poison, half [[TMP58]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = insertelement <4 x half> [[TMP62]], half [[TMP59]], i32 1 +; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = insertelement <4 x half> [[TMP63]], half [[TMP60]], i32 2 +; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = insertelement <4 x half> [[TMP64]], half [[TMP61]], i32 3 +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = fpext <4 x half> [[TMP57]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = fpext <4 x half> [[TMP65]] to <4 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP68:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP66]] +; CHECK-INTERLEAVED-NEXT: [[TMP69:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP67]] +; CHECK-INTERLEAVED-NEXT: [[TMP70]] = fadd reassoc <4 x float> [[TMP68]], [[TMP48]] +; CHECK-INTERLEAVED-NEXT: [[TMP71]] = fadd reassoc <4 x float> [[TMP69]], [[TMP49]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP72:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP72]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <4 x float> [[TMP71]], [[TMP70]] +; CHECK-INTERLEAVED-NEXT: [[TMP73:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVED: [[SCALAR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL11:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[MATRIX]], %[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP73]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, 
%[[FOR_PREHEADER]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVED: [[FOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL11]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi float [ [[ADD_1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[PTR]], i64 1 +; CHECK-INTERLEAVED-NEXT: [[GEP_B1:%.*]] = getelementptr half, ptr [[PTR]], i64 2 +; CHECK-INTERLEAVED-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-INTERLEAVED-NEXT: [[B_EXT:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = fmul float [[A_EXT]], [[B_EXT]] +; CHECK-INTERLEAVED-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_B1:%.*]] = load half, ptr [[GEP_B1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[B_EXT1:%.*]] = fpext half [[LOAD_B1]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL_1:%.*]] = fmul float [[A_EXT1]], [[B_EXT1]] +; CHECK-INTERLEAVED-NEXT: [[ADD_1]] = fadd reassoc float [[MUL_1]], [[ADD]] +; CHECK-INTERLEAVED-NEXT: [[SCEVGEP]] = getelementptr half, ptr [[PTR]], i64 16 +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[ADD_FLOAT:%.*]] = phi float [ [[ADD_1]], %[[FOR_BODY]] ], [ [[TMP73]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVED: [[EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_FLOAT]], %[[FOR_EXIT]] ] +; CHECK-INTERLEAVED-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-INTERLEAVED-NEXT: ret void +; +; CHECK-MAXBW-LABEL: define void @not_fdotp_not_phi2( +; CHECK-MAXBW-SAME: ptr [[MATRIX:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label %[[FOR_PREHEADER:.*]], label %[[EXIT:.*]] +; CHECK-MAXBW: [[FOR_PREHEADER]]: +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr null, align 1 +; CHECK-MAXBW-NEXT: [[LOAD_A1:%.*]] = load half, ptr inttoptr (i64 1 to ptr), align 1 +; CHECK-MAXBW-NEXT: [[A_EXT:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[A_EXT1:%.*]] = fpext half [[LOAD_A1]] to float +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = trunc i64 [[N_VEC]] to i32 +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul i64 [[N_VEC]], 32 +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP2]] +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[A_EXT]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <4 x i32> zeroinitializer +; 
CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x float> poison, float [[A_EXT1]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT1]], <4 x float> poison, <4 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , %[[VECTOR_PH]] ], [ [[TMP37:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 32 +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], 32 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 64 +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 96 +; CHECK-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP4]] +; CHECK-MAXBW-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP5]] +; CHECK-MAXBW-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP6]] +; CHECK-MAXBW-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[MATRIX]], i64 [[TMP7]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP10:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 1 +; CHECK-MAXBW-NEXT: [[TMP12:%.*]] = getelementptr half, ptr [[NEXT_GEP]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = getelementptr half, ptr [[NEXT_GEP3]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr half, ptr [[NEXT_GEP4]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP15:%.*]] = getelementptr half, ptr [[NEXT_GEP5]], i64 2 +; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = load half, ptr [[TMP8]], align 1 +; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = load half, ptr [[TMP9]], align 1 +; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = load half, ptr [[TMP10]], align 1 +; CHECK-MAXBW-NEXT: [[TMP19:%.*]] = load half, ptr [[TMP11]], align 1 +; CHECK-MAXBW-NEXT: [[TMP20:%.*]] = insertelement <4 x half> poison, half [[TMP16]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP21:%.*]] = insertelement <4 x half> [[TMP20]], half [[TMP17]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP22:%.*]] = insertelement <4 x half> [[TMP21]], half [[TMP18]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP23:%.*]] = insertelement <4 x half> [[TMP22]], half [[TMP19]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP24:%.*]] = fpext <4 x half> [[TMP23]] to <4 x float> +; CHECK-MAXBW-NEXT: [[TMP25:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT]], [[TMP24]] +; CHECK-MAXBW-NEXT: [[TMP26:%.*]] = fadd reassoc <4 x float> [[TMP25]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[TMP27:%.*]] = load half, ptr [[TMP12]], align 1 +; CHECK-MAXBW-NEXT: [[TMP28:%.*]] = load half, ptr [[TMP13]], align 1 +; CHECK-MAXBW-NEXT: [[TMP29:%.*]] = load half, ptr [[TMP14]], align 1 +; CHECK-MAXBW-NEXT: [[TMP30:%.*]] = load half, ptr [[TMP15]], align 1 +; CHECK-MAXBW-NEXT: [[TMP31:%.*]] = insertelement <4 x half> poison, half [[TMP27]], i32 0 +; CHECK-MAXBW-NEXT: [[TMP32:%.*]] = insertelement <4 x half> [[TMP31]], half [[TMP28]], i32 1 +; CHECK-MAXBW-NEXT: [[TMP33:%.*]] = insertelement <4 x half> [[TMP32]], half [[TMP29]], i32 2 +; CHECK-MAXBW-NEXT: [[TMP34:%.*]] = insertelement <4 x half> [[TMP33]], half [[TMP30]], i32 3 +; CHECK-MAXBW-NEXT: [[TMP35:%.*]] = fpext <4 x half> [[TMP34]] to <4 x float> +; CHECK-MAXBW-NEXT: 
[[TMP36:%.*]] = fmul <4 x float> [[BROADCAST_SPLAT2]], [[TMP35]] +; CHECK-MAXBW-NEXT: [[TMP37]] = fadd reassoc <4 x float> [[TMP36]], [[TMP26]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-MAXBW-NEXT: [[TMP38:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP38]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP39:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP37]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[FOR_EXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP1]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL6:%.*]] = phi ptr [ [[TMP3]], %[[MIDDLE_BLOCK]] ], [ [[MATRIX]], %[[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP39]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PREHEADER]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[PTR:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL6]], %[[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[ADD_1:%.*]], %[[FOR_BODY]] ], [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ] +; CHECK-MAXBW-NEXT: [[GEP_B:%.*]] = getelementptr half, ptr [[PTR]], i64 1 +; CHECK-MAXBW-NEXT: [[GEP_B1:%.*]] = getelementptr half, ptr [[PTR]], i64 2 +; CHECK-MAXBW-NEXT: [[LOAD_B:%.*]] = load half, ptr [[GEP_B]], align 1 +; CHECK-MAXBW-NEXT: [[B_EXT:%.*]] = fpext half [[LOAD_B]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[A_EXT]], [[B_EXT]] +; CHECK-MAXBW-NEXT: [[ADD:%.*]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[LOAD_B1:%.*]] = load half, ptr [[GEP_B1]], align 1 +; CHECK-MAXBW-NEXT: [[B_EXT1:%.*]] = fpext half [[LOAD_B1]] to float +; CHECK-MAXBW-NEXT: [[MUL_1:%.*]] = fmul float [[A_EXT1]], [[B_EXT1]] +; CHECK-MAXBW-NEXT: [[ADD_1]] = fadd reassoc float [[MUL_1]], [[ADD]] +; CHECK-MAXBW-NEXT: [[SCEVGEP]] = getelementptr half, ptr [[PTR]], i64 16 +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-MAXBW: [[FOR_EXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_FLOAT:%.*]] = phi float [ [[ADD_1]], %[[FOR_BODY]] ], [ [[TMP39]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: br label %[[EXIT]] +; CHECK-MAXBW: [[EXIT]]: +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_FLOAT]], %[[FOR_EXIT]] ] +; CHECK-MAXBW-NEXT: store float [[RESULT]], ptr [[MATRIX]], align 4 +; CHECK-MAXBW-NEXT: ret void +; +entry: + %cmp = icmp sgt i32 %n, 0 + br i1 %cmp, label %for.preheader, label %exit + +for.preheader: ; preds = %entry + %load.a = load half, ptr inttoptr (i64 0 to ptr), align 1 + %load.a1 = load half, ptr inttoptr (i64 1 to ptr), align 1 + %a.ext = fpext half %load.a to float + %a.ext1 = fpext half %load.a1 to float + br label %for.body + +for.body: ; preds = %for.preheader, %for.body + %iv = phi i32 [ %iv.next, %for.body ], [ 0, %for.preheader ] + %ptr = phi ptr [ %scevgep, %for.body ], [ %matrix, %for.preheader ] + %accum = phi float [ 
%add.1, %for.body ], [ 0.0, %for.preheader ] + %gep.b = getelementptr half, ptr %ptr, i64 1 + %gep.b1 = getelementptr half, ptr %ptr, i64 2 + %load.b = load half, ptr %gep.b, align 1 + %b.ext = fpext half %load.b to float + %mul = fmul float %a.ext, %b.ext + %add = fadd reassoc float %mul, %accum + %load.b1 = load half, ptr %gep.b1, align 1 + %b.ext1 = fpext half %load.b1 to float + %mul.1 = fmul float %a.ext1, %b.ext1 + %add.1 = fadd reassoc float %mul.1, %add + %scevgep = getelementptr half, ptr %ptr, i64 16 + %iv.next = add nuw nsw i32 %iv, 1 + %exitcond.not = icmp eq i32 %iv.next, %n + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + %add.float = phi float [ %add.1, %for.body ] + br label %exit + +exit: ; preds = %for.exit, %entry + %result = phi float [ 0.000000e+00, %entry ], [ %add.float, %for.exit ] + store float %result, ptr %matrix, align 4 + ret void +} + +define float @not_fdotp_ext_outside_plan(ptr %a, half %b, i64 %n) #0 { +; CHECK-INTERLEAVE1-LABEL: define float @not_fdotp_ext_outside_plan( +; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], half [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[FOR_PH:.*]] +; CHECK-INTERLEAVE1: [[FOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[EXT_B:%.*]] = fpext half [[B]] to float +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[EXT_B]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[VECTOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = fmul <8 x float> [[TMP1]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3]] = fadd reassoc <8 x float> [[TMP2]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVE1: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[TMP3]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVE1: [[SCALAR_PH]]: +; CHECK-INTERLEAVE1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], 
%[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP5]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PH]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVE1: [[FOR_BODY]]: +; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[GEP_A:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVE1-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVE1-NEXT: [[MUL:%.*]] = fmul float [[EXT_A]], [[EXT_B]] +; CHECK-INTERLEAVE1-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVE1-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_1]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVE1: [[EXIT_LOOPEXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP5]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVE1-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVE1: [[EXIT]]: +; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-INTERLEAVE1-NEXT: ret float [[RESULT]] +; +; CHECK-INTERLEAVED-LABEL: define float @not_fdotp_ext_outside_plan( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], half [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*]]: +; CHECK-INTERLEAVED-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[FOR_PH:.*]] +; CHECK-INTERLEAVED: [[FOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[EXT_B:%.*]] = fpext half [[B]] to float +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x float> poison, float [[EXT_B]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x float> [[BROADCAST_SPLATINSERT]], <8 x float> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x float> [ , %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x float> [ splat (float -0.000000e+00), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds half, ptr [[TMP0]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x half>, ptr [[TMP0]], align 2 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x half>, ptr [[TMP1]], align 2 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = fpext <8 x half> [[WIDE_LOAD]] to <8 x float> +; 
CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = fpext <8 x half> [[WIDE_LOAD2]] to <8 x float> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = fmul <8 x float> [[TMP2]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = fmul <8 x float> [[TMP3]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP6]] = fadd reassoc <8 x float> [[TMP4]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP7]] = fadd reassoc <8 x float> [[TMP5]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = fadd reassoc <8 x float> [[TMP7]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.v8f32(float -0.000000e+00, <8 x float> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-INTERLEAVED: [[SCALAR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PH]] ] +; CHECK-INTERLEAVED-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PH]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-INTERLEAVED: [[FOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[GEP_A:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]] +; CHECK-INTERLEAVED-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2 +; CHECK-INTERLEAVED-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = fmul float [[EXT_A]], [[EXT_B]] +; CHECK-INTERLEAVED-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-INTERLEAVED-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-INTERLEAVED-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_1]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVED: [[EXIT_LOOPEXIT]]: +; CHECK-INTERLEAVED-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] +; CHECK-INTERLEAVED-NEXT: br label %[[EXIT]] +; CHECK-INTERLEAVED: [[EXIT]]: +; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-INTERLEAVED-NEXT: ret float [[RESULT]] +; +; CHECK-MAXBW-LABEL: define float @not_fdotp_ext_outside_plan( +; CHECK-MAXBW-SAME: ptr [[A:%.*]], half [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: [[ENTRY:.*]]: +; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i64 [[N]], 0 +; CHECK-MAXBW-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[FOR_PH:.*]] +; CHECK-MAXBW: [[FOR_PH]]: +; CHECK-MAXBW-NEXT: [[EXT_B:%.*]] = fpext half [[B]] to float +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 3 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK-MAXBW: [[VECTOR_PH]]: +; 
CHECK-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, float [[EXT_B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-MAXBW-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-MAXBW: [[VECTOR_BODY]]: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( splat (float -0.000000e+00), float 0.000000e+00, i32 0), %[[VECTOR_PH]] ], [ [[TMP7:%.*]], %[[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[INDEX]] +; CHECK-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 2 +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = fpext [[WIDE_LOAD]] to +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = fmul [[TMP5]], [[BROADCAST_SPLAT]] +; CHECK-MAXBW-NEXT: [[TMP7]] = fadd reassoc [[TMP6]], [[VEC_PHI]] +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-MAXBW: [[MIDDLE_BLOCK]]: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call reassoc float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, [[TMP7]]) +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]] +; CHECK-MAXBW: [[SCALAR_PH]]: +; CHECK-MAXBW-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_PH]] ] +; CHECK-MAXBW-NEXT: [[BC_MERGE_RDX:%.*]] = phi float [ [[TMP9]], %[[MIDDLE_BLOCK]] ], [ 0.000000e+00, %[[FOR_PH]] ] +; CHECK-MAXBW-NEXT: br label %[[FOR_BODY:.*]] +; CHECK-MAXBW: [[FOR_BODY]]: +; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[ACCUM:%.*]] = phi float [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[GEP_A:%.*]] = getelementptr inbounds half, ptr [[A]], i64 [[IV]] +; CHECK-MAXBW-NEXT: [[LOAD_A:%.*]] = load half, ptr [[GEP_A]], align 2 +; CHECK-MAXBW-NEXT: [[EXT_A:%.*]] = fpext half [[LOAD_A]] to float +; CHECK-MAXBW-NEXT: [[MUL:%.*]] = fmul float [[EXT_A]], [[EXT_B]] +; CHECK-MAXBW-NEXT: [[ADD]] = fadd reassoc float [[MUL]], [[ACCUM]] +; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-MAXBW-NEXT: [[CMP_1:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_1]], label %[[EXIT_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW: [[EXIT_LOOPEXIT]]: +; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi float [ [[ADD]], %[[FOR_BODY]] ], [ [[TMP9]], %[[MIDDLE_BLOCK]] ] +; CHECK-MAXBW-NEXT: br label %[[EXIT]] +; CHECK-MAXBW: [[EXIT]]: +; CHECK-MAXBW-NEXT: [[RESULT:%.*]] = phi float [ 0.000000e+00, %[[ENTRY]] ], [ [[ADD_LCSSA]], %[[EXIT_LOOPEXIT]] ] +; CHECK-MAXBW-NEXT: ret float [[RESULT]] +; +entry: + %cmp = icmp eq i64 %n, 0 + br i1 %cmp, label %exit, label %for.ph + +for.ph: ; preds = %entry + %ext.b = fpext half %b to float + br label %for.body + +for.body: ; preds = %for.body.lr.ph, %for.body + %iv = phi i64 [ 0, %for.ph ], [ %iv.next, 
%for.body ] + %accum = phi float [ 0.0, %for.ph ], [ %add, %for.body ] + %gep.a = getelementptr inbounds half, ptr %a, i64 %iv + %load.a = load half, ptr %gep.a, align 2 + %ext.a = fpext half %load.a to float + %mul = fmul float %ext.a, %ext.b + %add = fadd reassoc float %mul, %accum + %iv.next = add nuw nsw i64 %iv, 1 + %cmp.1 = icmp eq i64 %iv.next, %n + br i1 %cmp.1, label %exit, label %for.body + +exit: ; preds = %for.cond.cleanup.loopexit, %entry + %result = phi float [ 0.0, %entry ], [ %add, %for.body ] + ret float %result +} + + +;. +; CHECK-INTERLEAVE1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVE1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVE1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-INTERLEAVE1: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-INTERLEAVE1: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-INTERLEAVE1: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +;. +; CHECK-INTERLEAVED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-INTERLEAVED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-INTERLEAVED: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-INTERLEAVED: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-INTERLEAVED: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-INTERLEAVED: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +;. +; CHECK-MAXBW: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-MAXBW: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-MAXBW: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-MAXBW: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP9]] = distinct !{[[LOOP9]], [[META2]], [[META1]]} +; CHECK-MAXBW: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]], [[META2]]} +; CHECK-MAXBW: [[LOOP11]] = distinct !{[[LOOP11]], [[META2]], [[META1]]} +;.
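
The functions in this test are negative cases: mismatched extension types (half and float widened to double), an accumulator that is not the loop phi, and an fpext defined outside the loop all have to keep the vectorizer on the ordinary llvm.vector.reduce.fadd path that the CHECK lines above verify. For contrast, below is a minimal sketch of the in-loop pattern a positive case is expected to reduce to, using the llvm.vector.partial.reduce.fadd intrinsic introduced by this patch; the function name, vector widths, and overload mangling here are illustrative only and may not match what the vectorizer actually emits.

  define <4 x float> @fdotp_sketch(<4 x float> %acc, <8 x half> %a, <8 x half> %b) {
    ; Both operands share the same widening (half -> float), so the widened
    ; multiply can feed a partial reduction of the accumulator instead of a
    ; full-width reassociating fadd into the reduction phi.
    %ext.a = fpext <8 x half> %a to <8 x float>
    %ext.b = fpext <8 x half> %b to <8 x float>
    %mul = fmul reassoc <8 x float> %ext.b, %ext.a
    %red = call <4 x float> @llvm.vector.partial.reduce.fadd.v4f32.v8f32(<4 x float> %acc, <8 x float> %mul)
    ret <4 x float> %red
  }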