diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 1aebcc4439964..df2201d225d07 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -24105,6 +24105,130 @@ Examples: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison) +.. _int_loop_dependence_war_mask: + +'``llvm.loop.dependence.war.mask.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) + declare <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) + declare <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) + declare <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) + + +Overview: +""""""""" + +Given a vector load from %ptrA followed by a vector store to %ptrB, this +intrinsic generates a mask where an active lane indicates that the +write-after-read sequence can be performed safely for that lane, without the +danger of a write-after-read hazard occurring. + +A write-after-read hazard occurs when a write-after-read sequence for a given +lane in a vector ends up being executed as a read-after-write sequence due to +the aliasing of pointers. + +Arguments: +"""""""""" + +The first two arguments are pointers and the last argument is an immediate. +The result is a vector with the i1 element type. + +Semantics: +"""""""""" + +``%elementSize`` is the size of the accessed elements in bytes. +The intrinsic returns ``poison`` if the distance between ``%ptrA`` and ``%ptrB`` +is smaller than ``VF * %elementSize`` and either ``%ptrA + VF * %elementSize`` +or ``%ptrB + VF * %elementSize`` wraps. +An element of the result mask is active when loading from %ptrA then storing to +%ptrB is safe and doesn't result in a write-after-read hazard, meaning that: + +* (ptrB - ptrA) <= 0 (guarantees that all lanes are loaded before any stores), or +* (ptrB - ptrA) >= elementSize * lane (guarantees that this lane is loaded + before the store to the same address) + +Examples: +""""""""" + +.. code-block:: llvm + + %loop.dependence.mask = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %ptrA, ptr %ptrB, i64 4) + %vecA = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(ptr %ptrA, i32 4, <4 x i1> %loop.dependence.mask, <4 x i32> poison) + [...] + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %vecA, ptr %ptrB, i32 4, <4 x i1> %loop.dependence.mask) + +.. _int_loop_dependence_raw_mask: + +'``llvm.loop.dependence.raw.mask.*``' Intrinsics +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. 
+ +:: + + declare <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) + declare <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) + declare <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) + declare <vscale x 16 x i1> @llvm.loop.dependence.raw.mask.nxv16i1(ptr %ptrA, ptr %ptrB, i64 immarg %elementSize) + + +Overview: +""""""""" + +Given a vector store to %ptrA followed by a vector load from %ptrB, this +intrinsic generates a mask where an active lane indicates that the +read-after-write sequence can be performed safely for that lane, without a +read-after-write hazard or a store-to-load forwarding hazard being introduced. + +A read-after-write hazard occurs when a read-after-write sequence for a given +lane in a vector ends up being executed as a write-after-read sequence due to +the aliasing of pointers. + +A store-to-load forwarding hazard occurs when a vector store writes to an +address that partially overlaps with the address of a subsequent vector load, +meaning that the vector load can't be performed until the vector store is +complete. + +Arguments: +"""""""""" + +The first two arguments are pointers and the last argument is an immediate. +The result is a vector with the i1 element type. + +Semantics: +"""""""""" + +``%elementSize`` is the size of the accessed elements in bytes. +The intrinsic returns ``poison`` if the distance between ``%ptrA`` and ``%ptrB`` +is smaller than ``VF * %elementSize`` and either ``%ptrA + VF * %elementSize`` +or ``%ptrB + VF * %elementSize`` wraps. +An element of the result mask is active when storing to %ptrA then loading from +%ptrB is safe and doesn't result in aliasing, meaning that: + +* abs(ptrB - ptrA) >= elementSize * lane (guarantees that the store of this lane + occurs before loading from this address), or +* ptrA == ptrB (doesn't introduce any new hazards that weren't in the scalar + code) + +Examples: +""""""""" + +.. code-block:: llvm + + %loop.dependence.mask = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %ptrA, ptr %ptrB, i64 4) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %vecA, ptr %ptrA, i32 4, <4 x i1> %loop.dependence.mask) + [...] + %vecB = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(ptr %ptrB, i32 4, <4 x i1> %loop.dependence.mask, <4 x i32> poison) + .. _int_experimental_vp_splice: '``llvm.experimental.vp.splice``' Intrinsic diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 465e4a0a9d0d8..c76c83d84b3c7 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -1558,6 +1558,12 @@ enum NodeType { // bits conform to getBooleanContents similar to the SETCC operator. 
GET_ACTIVE_LANE_MASK, + // The `llvm.loop.dependence.{war, raw}.mask` intrinsics + // Operands: Load pointer, Store pointer, Element size + // Output: Mask + LOOP_DEPENDENCE_WAR_MASK, + LOOP_DEPENDENCE_RAW_MASK, + // llvm.clear_cache intrinsic // Operands: Input Chain, Start Addres, End Address // Outputs: Output Chain diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index e0ee12391b31d..8e2e0604cb3af 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2420,6 +2420,16 @@ let IntrProperties = [IntrNoMem, ImmArg>] in { llvm_i32_ty]>; } +def int_loop_dependence_raw_mask: + DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_ptr_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<2>>]>; + +def int_loop_dependence_war_mask: + DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_ptr_ty, llvm_ptr_ty, llvm_i64_ty], + [IntrNoMem, IntrNoSync, IntrWillReturn, ImmArg<ArgIndex<2>>]>; + def int_get_active_lane_mask: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyint_ty, LLVMMatchType<1>], diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index a4ed62bb5715c..b4dd88a809d4e 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -833,6 +833,14 @@ def step_vector : SDNode<"ISD::STEP_VECTOR", SDTypeProfile<1, 1, def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>, []>; +def SDTLoopDepMask : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<1>, + SDTCisSameAs<2, 1>, SDTCisInt<3>, + SDTCVecEltisVT<0,i1>]>; +def loop_dependence_war_mask : SDNode<"ISD::LOOP_DEPENDENCE_WAR_MASK", + SDTLoopDepMask, []>; +def loop_dependence_raw_mask : SDNode<"ISD::LOOP_DEPENDENCE_RAW_MASK", + SDTLoopDepMask, []>; + // vector_extract/vector_insert are similar to extractelt/insertelt but allow // types that require promotion (a 16i8 extract where i8 is not a legal type so // uses i32 for example). extractelt/insertelt are preferred where the element diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index a5bd97ace169e..922a1e1064f6c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -324,6 +324,11 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) { Res = PromoteIntRes_VP_REDUCE(N); break; + case ISD::LOOP_DEPENDENCE_WAR_MASK: + case ISD::LOOP_DEPENDENCE_RAW_MASK: + Res = PromoteIntRes_LOOP_DEPENDENCE_MASK(N); + break; + case ISD::FREEZE: Res = PromoteIntRes_FREEZE(N); break; @@ -374,6 +379,12 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N, return GetPromotedInteger(Op); } +SDValue DAGTypeLegalizer::PromoteIntRes_LOOP_DEPENDENCE_MASK(SDNode *N) { + EVT VT = N->getValueType(0); + EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + return DAG.getNode(N->getOpcode(), SDLoc(N), NewVT, N->ops()); +} + SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) { // Sign-extend the new bits, and continue the assertion. 
SDValue Op = SExtPromotedInteger(N->getOperand(0)); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 33fa3012618b3..fe4e409df79b1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -382,6 +382,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntRes_VECTOR_FIND_LAST_ACTIVE(SDNode *N); SDValue PromoteIntRes_GET_ACTIVE_LANE_MASK(SDNode *N); SDValue PromoteIntRes_PARTIAL_REDUCE_MLA(SDNode *N); + SDValue PromoteIntRes_LOOP_DEPENDENCE_MASK(SDNode *N); // Integer Operand Promotion. bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); @@ -436,6 +437,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue PromoteIntOp_VECTOR_FIND_LAST_ACTIVE(SDNode *N, unsigned OpNo); SDValue PromoteIntOp_GET_ACTIVE_LANE_MASK(SDNode *N); SDValue PromoteIntOp_PARTIAL_REDUCE_MLA(SDNode *N); + SDValue PromoteIntOp_LOOP_DEPENDENCE_MASK(SDNode *N, unsigned OpNo); void SExtOrZExtPromotedOperands(SDValue &LHS, SDValue &RHS); void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code); @@ -868,6 +870,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { // Vector Result Scalarization: <1 x ty> -> ty. void ScalarizeVectorResult(SDNode *N, unsigned ResNo); SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo); + SDValue ScalarizeVecRes_LOOP_DEPENDENCE_MASK(SDNode *N); SDValue ScalarizeVecRes_BinOp(SDNode *N); SDValue ScalarizeVecRes_CMP(SDNode *N); SDValue ScalarizeVecRes_TernaryOp(SDNode *N); @@ -963,6 +966,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { void SplitVecRes_FIX(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi); + void SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi); void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi); @@ -1069,6 +1073,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue WidenVecRes_ADDRSPACECAST(SDNode *N); SDValue WidenVecRes_AssertZext(SDNode* N); SDValue WidenVecRes_BITCAST(SDNode* N); + SDValue WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N); SDValue WidenVecRes_BUILD_VECTOR(SDNode* N); SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N); SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index d2ecc13331e02..5420de97bd82d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -138,6 +138,7 @@ class VectorLegalizer { SDValue ExpandVP_FNEG(SDNode *Node); SDValue ExpandVP_FABS(SDNode *Node); SDValue ExpandVP_FCOPYSIGN(SDNode *Node); + SDValue ExpandLOOP_DEPENDENCE_MASK(SDNode *N); SDValue ExpandSELECT(SDNode *Node); std::pair ExpandLoad(SDNode *N); SDValue ExpandStore(SDNode *N); @@ -475,6 +476,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::VECTOR_COMPRESS: case ISD::SCMP: case ISD::UCMP: + case ISD::LOOP_DEPENDENCE_WAR_MASK: + case ISD::LOOP_DEPENDENCE_RAW_MASK: Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0)); break; case ISD::SMULFIX: @@ -1291,6 +1294,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl &Results) { case ISD::UCMP: Results.push_back(TLI.expandCMP(Node, DAG)); return; + case ISD::LOOP_DEPENDENCE_WAR_MASK: + case 
ISD::LOOP_DEPENDENCE_RAW_MASK: + Results.push_back(ExpandLOOP_DEPENDENCE_MASK(Node)); + return; case ISD::FADD: case ISD::FMUL: @@ -1796,6 +1803,50 @@ SDValue VectorLegalizer::ExpandVP_FCOPYSIGN(SDNode *Node) { return DAG.getNode(ISD::BITCAST, DL, VT, CopiedSign); } +SDValue VectorLegalizer::ExpandLOOP_DEPENDENCE_MASK(SDNode *N) { + SDLoc DL(N); + SDValue SourceValue = N->getOperand(0); + SDValue SinkValue = N->getOperand(1); + SDValue EltSize = N->getOperand(2); + + bool IsReadAfterWrite = N->getOpcode() == ISD::LOOP_DEPENDENCE_RAW_MASK; + EVT VT = N->getValueType(0); + EVT PtrVT = SourceValue->getValueType(0); + + SDValue Diff = DAG.getNode(ISD::SUB, DL, PtrVT, SinkValue, SourceValue); + if (IsReadAfterWrite) + Diff = DAG.getNode(ISD::ABS, DL, PtrVT, Diff); + + Diff = DAG.getNode(ISD::SDIV, DL, PtrVT, Diff, EltSize); + + // If the difference is positive then some elements may alias + EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + Diff.getValueType()); + SDValue Zero = DAG.getTargetConstant(0, DL, PtrVT); + SDValue Cmp = DAG.getSetCC(DL, CmpVT, Diff, Zero, + IsReadAfterWrite ? ISD::SETEQ : ISD::SETLE); + + // Create the lane mask + EVT SplatVT = VT.changeElementType(PtrVT); + SDValue DiffSplat = DAG.getSplat(SplatVT, DL, Diff); + SDValue VectorStep = DAG.getStepVector(DL, SplatVT); + EVT MaskVT = VT.changeElementType(MVT::i1); + SDValue DiffMask = + DAG.getSetCC(DL, MaskVT, VectorStep, DiffSplat, ISD::CondCode::SETULT); + + EVT EltVT = VT.getVectorElementType(); + // Extend the diff setcc in case the intrinsic has been promoted to a vector + // type with elements larger than i1 + if (EltVT.getScalarSizeInBits() > MaskVT.getScalarSizeInBits()) + DiffMask = DAG.getNode(ISD::ANY_EXTEND, DL, VT, DiffMask); + + // Splat the compare result then OR it with the lane mask + if (CmpVT.getScalarSizeInBits() < EltVT.getScalarSizeInBits()) + Cmp = DAG.getNode(ISD::ZERO_EXTEND, DL, EltVT, Cmp); + SDValue Splat = DAG.getSplat(VT, DL, Cmp); + return DAG.getNode(ISD::OR, DL, VT, DiffMask, Splat); +} + void VectorLegalizer::ExpandFP_TO_UINT(SDNode *Node, SmallVectorImpl &Results) { // Attempt to expand using TargetLowering. 
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index bc2dbfb4cbaae..0c430d5be31e6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -53,6 +53,10 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { report_fatal_error("Do not know how to scalarize the result of this " "operator!\n"); + case ISD::LOOP_DEPENDENCE_WAR_MASK: + case ISD::LOOP_DEPENDENCE_RAW_MASK: + R = ScalarizeVecRes_LOOP_DEPENDENCE_MASK(N); + break; case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break; case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break; case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break; @@ -396,6 +400,22 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N, return GetScalarizedVector(Op); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) { + SDValue SourceValue = N->getOperand(0); + SDValue SinkValue = N->getOperand(1); + SDValue EltSize = N->getOperand(2); + EVT PtrVT = SourceValue->getValueType(0); + SDLoc DL(N); + + SDValue Diff = DAG.getNode(ISD::SUB, DL, PtrVT, SinkValue, SourceValue); + EVT CmpVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), + Diff.getValueType()); + SDValue Zero = DAG.getTargetConstant(0, DL, PtrVT); + return DAG.getNode(ISD::OR, DL, CmpVT, + DAG.getSetCC(DL, CmpVT, Diff, EltSize, ISD::SETGE), + DAG.getSetCC(DL, CmpVT, Diff, Zero, ISD::SETEQ)); +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) { SDValue Op = N->getOperand(0); if (getTypeAction(Op.getValueType()) == TargetLowering::TypeScalarizeVector) @@ -1118,6 +1138,10 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { report_fatal_error("Do not know how to split the result of this " "operator!\n"); + case ISD::LOOP_DEPENDENCE_RAW_MASK: + case ISD::LOOP_DEPENDENCE_WAR_MASK: + SplitVecRes_LOOP_DEPENDENCE_MASK(N, Lo, Hi); + break; case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::AssertZext: SplitVecRes_AssertZext(N, Lo, Hi); break; case ISD::VSELECT: @@ -1611,6 +1635,25 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi); } +void DAGTypeLegalizer::SplitVecRes_LOOP_DEPENDENCE_MASK(SDNode *N, SDValue &Lo, + SDValue &Hi) { + SDLoc DL(N); + EVT LoVT, HiVT; + std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); + SDValue PtrA = N->getOperand(0); + SDValue PtrB = N->getOperand(1); + Lo = DAG.getNode(N->getOpcode(), DL, LoVT, PtrA, PtrB, N->getOperand(2)); + + unsigned EltSize = N->getConstantOperandVal(2); + unsigned Offset = EltSize * HiVT.getVectorMinNumElements(); + SDValue Addend = HiVT.isScalableVT() + ? 
DAG.getVScale(DL, MVT::i64, APInt(64, Offset)) + : DAG.getConstant(Offset, DL, MVT::i64); + + PtrA = DAG.getNode(ISD::ADD, DL, MVT::i64, PtrA, Addend); + Hi = DAG.getNode(N->getOpcode(), DL, HiVT, PtrA, PtrB, N->getOperand(2)); +} + void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi) { EVT LoVT, HiVT; @@ -4711,6 +4754,10 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { #endif report_fatal_error("Do not know how to widen the result of this operator!"); + case ISD::LOOP_DEPENDENCE_RAW_MASK: + case ISD::LOOP_DEPENDENCE_WAR_MASK: + Res = WidenVecRes_LOOP_DEPENDENCE_MASK(N); + break; case ISD::MERGE_VALUES: Res = WidenVecRes_MERGE_VALUES(N, ResNo); break; case ISD::ADDRSPACECAST: Res = WidenVecRes_ADDRSPACECAST(N); @@ -5913,6 +5960,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) { return CreateStackStoreLoad(InOp, WidenVT); } +SDValue DAGTypeLegalizer::WidenVecRes_LOOP_DEPENDENCE_MASK(SDNode *N) { + return DAG.getNode( + N->getOpcode(), SDLoc(N), + TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0)), + N->getOperand(0), N->getOperand(1), N->getOperand(2)); +} + SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) { SDLoc dl(N); // Build a vector with undefined for the new nodes. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2eaab02130699..f235ce532a09d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -8314,6 +8314,18 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, visitVectorExtractLastActive(I, Intrinsic); return; } + case Intrinsic::loop_dependence_war_mask: + setValue(&I, + DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, sdl, + EVT::getEVT(I.getType()), getValue(I.getOperand(0)), + getValue(I.getOperand(1)), getValue(I.getOperand(2)))); + return; + case Intrinsic::loop_dependence_raw_mask: + setValue(&I, + DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, sdl, + EVT::getEVT(I.getType()), getValue(I.getOperand(0)), + getValue(I.getOperand(1)), getValue(I.getOperand(2)))); + return; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 900da7645504f..4b2a00c2e2cfa 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -587,6 +587,10 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { return "partial_reduce_smla"; case ISD::PARTIAL_REDUCE_SUMLA: return "partial_reduce_sumla"; + case ISD::LOOP_DEPENDENCE_WAR_MASK: + return "loop_dep_war"; + case ISD::LOOP_DEPENDENCE_RAW_MASK: + return "loop_dep_raw"; // Vector Predication #define BEGIN_REGISTER_VP_SDNODE(SDID, LEGALARG, NAME, ...) \ diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 350948a92a3ae..ea57bef1c0701 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -900,6 +900,9 @@ void TargetLoweringBase::initActions() { // Masked vector extracts default to expand. setOperationAction(ISD::VECTOR_FIND_LAST_ACTIVE, VT, Expand); + setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Expand); + setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Expand); + // FP environment operations default to expand. 
setOperationAction(ISD::GET_FPENV, VT, Expand); setOperationAction(ISD::SET_FPENV, VT, Expand); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2072e48914ae6..65a6bf431ddc1 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1915,6 +1915,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } } + // Handle non-aliasing elements mask + if (Subtarget->hasSVE2() || + (Subtarget->hasSME() && Subtarget->isStreaming())) { + // FIXME: Support wider fixed-length types when msve-vector-bits is used. + for (auto VT : {MVT::v2i32, MVT::v4i16, MVT::v8i8, MVT::v16i8}) { + setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Custom); + setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Custom); + } + for (auto VT : {MVT::nxv2i1, MVT::nxv4i1, MVT::nxv8i1, MVT::nxv16i1}) { + setOperationAction(ISD::LOOP_DEPENDENCE_RAW_MASK, VT, Custom); + setOperationAction(ISD::LOOP_DEPENDENCE_WAR_MASK, VT, Custom); + } + } + // Handle operations that are only available in non-streaming SVE mode. if (Subtarget->isSVEAvailable()) { for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64, @@ -5229,6 +5243,56 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, static MVT getSVEContainerType(EVT ContentTy); +SDValue +AArch64TargetLowering::LowerLOOP_DEPENDENCE_MASK(SDValue Op, + SelectionDAG &DAG) const { + SDLoc DL(Op); + uint64_t EltSize = Op.getConstantOperandVal(2); + EVT VT = Op.getValueType(); + switch (EltSize) { + case 1: + if (VT != MVT::v16i8 && VT != MVT::nxv16i1) + return SDValue(); + break; + case 2: + if (VT != MVT::v8i8 && VT != MVT::nxv8i1) + return SDValue(); + break; + case 4: + if (VT != MVT::v4i16 && VT != MVT::nxv4i1) + return SDValue(); + break; + case 8: + if (VT != MVT::v2i32 && VT != MVT::nxv2i1) + return SDValue(); + break; + default: + // Other element sizes are incompatible with whilewr/rw, so expand instead + return SDValue(); + } + + SDValue PtrA = Op.getOperand(0); + SDValue PtrB = Op.getOperand(1); + + if (VT.isScalableVT()) + return DAG.getNode(Op.getOpcode(), DL, VT, PtrA, PtrB, Op.getOperand(2)); + + // We can use the SVE whilewr/whilerw instruction to lower this + // intrinsic by creating the appropriate sequence of scalable vector + // operations and then extracting a fixed-width subvector from the scalable + // vector. Scalable vector variants are already legal. 
+ EVT ContainerVT = + EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), + VT.getVectorNumElements(), true); + EVT WhileVT = ContainerVT.changeElementType(MVT::i1); + + SDValue Mask = + DAG.getNode(Op.getOpcode(), DL, WhileVT, PtrA, PtrB, Op.getOperand(2)); + SDValue MaskAsInt = DAG.getNode(ISD::SIGN_EXTEND, DL, ContainerVT, Mask); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, MaskAsInt, + DAG.getVectorIdxConstant(0, DL)); +} + SDValue AArch64TargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const { EVT OpVT = Op.getValueType(); @@ -5987,6 +6051,38 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, EVT PtrVT = getPointerTy(DAG.getDataLayout()); return DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); } + case Intrinsic::aarch64_sve_whilewr_b: + return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(1, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilewr_h: + return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(2, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilewr_s: + return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(4, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilewr_d: + return DAG.getNode(ISD::LOOP_DEPENDENCE_WAR_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(8, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilerw_b: + return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(1, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilerw_h: + return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(2, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilerw_s: + return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(4, DL, MVT::i64)); + case Intrinsic::aarch64_sve_whilerw_d: + return DAG.getNode(ISD::LOOP_DEPENDENCE_RAW_MASK, DL, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), + DAG.getConstant(8, DL, MVT::i64)); case Intrinsic::aarch64_neon_abs: { EVT Ty = Op.getValueType(); if (Ty == MVT::i64) { @@ -7346,6 +7442,9 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, default: llvm_unreachable("unimplemented operand"); return SDValue(); + case ISD::LOOP_DEPENDENCE_RAW_MASK: + case ISD::LOOP_DEPENDENCE_WAR_MASK: + return LowerLOOP_DEPENDENCE_MASK(Op, DAG); case ISD::BITCAST: return LowerBITCAST(Op, DAG); case ISD::GlobalAddress: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 78d6a507b80d3..8869ecf423a61 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -728,6 +728,7 @@ class AArch64TargetLowering : public TargetLowering { SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerLOOP_DEPENDENCE_MASK(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td 
b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 9775238027650..2c0a0bc91b8b1 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4125,8 +4125,8 @@ let Predicates = [HasSVE2_or_SME] in { defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi, get_active_lane_mask>; // SVE2 pointer conflict compare - defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">; - defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">; + defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", loop_dependence_war_mask>; + defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", loop_dependence_raw_mask>; } // End HasSVE2_or_SME let Predicates = [HasSVEAES, HasNonStreamingSVE_or_SSVE_AES] in { diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index a3a7d0f74e1bc..fa22ba8aa113b 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -5946,16 +5946,20 @@ class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm, let isWhile = 1; } -multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> { +multiclass sve2_int_while_rr<bits<1> rw, string asm, SDPatternOperator op> { def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>; def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>; def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>; def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>; - def : SVE_2_Op_Pat<nxv16i1, !cast<Intrinsic>(op # _b), i64, i64, !cast<Instruction>(NAME # _B)>; - def : SVE_2_Op_Pat<nxv8i1, !cast<Intrinsic>(op # _h), i64, i64, !cast<Instruction>(NAME # _H)>; - def : SVE_2_Op_Pat<nxv4i1, !cast<Intrinsic>(op # _s), i64, i64, !cast<Instruction>(NAME # _S)>; - def : SVE_2_Op_Pat<nxv2i1, !cast<Intrinsic>(op # _d), i64, i64, !cast<Instruction>(NAME # _D)>; + def : Pat<(nxv16i1 (op i64:$Op1, i64:$Op2, (i64 1))), + (!cast<Instruction>(NAME # _B) $Op1, $Op2)>; + def : Pat<(nxv8i1 (op i64:$Op1, i64:$Op2, (i64 2))), + (!cast<Instruction>(NAME # _H) $Op1, $Op2)>; + def : Pat<(nxv4i1 (op i64:$Op1, i64:$Op2, (i64 4))), + (!cast<Instruction>(NAME # _S) $Op1, $Op2)>; + def : Pat<(nxv2i1 (op i64:$Op1, i64:$Op2, (i64 8))), + (!cast<Instruction>(NAME # _D) $Op1, $Op2)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/alias_mask.ll b/llvm/test/CodeGen/AArch64/alias_mask.ll new file mode 100644 index 0000000000000..9b9c020016bab --- /dev/null +++ b/llvm/test/CodeGen/AArch64/alias_mask.ll @@ -0,0 +1,900 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s + +define <16 x i1> @whilewr_8(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) + ret <16 x i1> %0 +} + +define <8 x i1> @whilewr_16(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.h, x0, x1 +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: ret +entry: + %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 2) + ret <8 x i1> %0 +} + +define <4 x i1> @whilewr_32(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret 
+entry: + %0 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 4) + ret <4 x i1> %0 +} + +define <2 x i1> @whilewr_64(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.d, x0, x1 +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret +entry: + %0 = call <2 x i1> @llvm.loop.dependence.war.mask.v2i1(ptr %a, ptr %b, i64 8) + ret <2 x i1> %0 +} + +define <16 x i1> @whilerw_8(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.b, x0, x1 +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0 +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.raw.mask.v16i1(ptr %a, ptr %b, i64 1) + ret <16 x i1> %0 +} + +define <8 x i1> @whilerw_16(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.h, x0, x1 +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: ret +entry: + %0 = call <8 x i1> @llvm.loop.dependence.raw.mask.v8i1(ptr %a, ptr %b, i64 2) + ret <8 x i1> %0 +} + +define <4 x i1> @whilerw_32(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.s, x0, x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: ret +entry: + %0 = call <4 x i1> @llvm.loop.dependence.raw.mask.v4i1(ptr %a, ptr %b, i64 4) + ret <4 x i1> %0 +} + +define <2 x i1> @whilerw_64(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.d, x0, x1 +; CHECK-NEXT: mov z0.d, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: ret +entry: + %0 = call <2 x i1> @llvm.loop.dependence.raw.mask.v2i1(ptr %a, ptr %b, i64 8) + ret <2 x i1> %0 +} + +define <32 x i1> @whilewr_8_split(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_split: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add x9, x0, #16 +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: whilewr p1.b, x9, x1 +; CHECK-NEXT: adrp x9, .LCPI8_0 +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_0] +; CHECK-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: zip1 v0.16b, v0.16b, v2.16b +; CHECK-NEXT: zip1 v1.16b, v1.16b, v3.16b +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: str h1, [x8, #2] +; CHECK-NEXT: ret +entry: + %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 1) + ret <32 x i1> %0 +} + +define <64 x i1> @whilewr_8_split2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_split2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: add x9, x0, #48 +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: add x10, x0, #16 +; CHECK-NEXT: whilewr p1.b, x9, x1 +; CHECK-NEXT: add x9, x0, #32 +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p0.b, x9, x1 +; CHECK-NEXT: adrp x9, .LCPI9_0 +; CHECK-NEXT: mov z1.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: whilewr p1.b, x10, x1 +; CHECK-NEXT: ldr q4, [x9, 
:lo12:.LCPI9_0] +; CHECK-NEXT: mov z2.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: mov z3.b, p1/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v2.16b, v2.16b, #7 +; CHECK-NEXT: shl v3.16b, v3.16b, #7 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmlt v2.16b, v2.16b, #0 +; CHECK-NEXT: cmlt v3.16b, v3.16b, #0 +; CHECK-NEXT: and v0.16b, v0.16b, v4.16b +; CHECK-NEXT: and v1.16b, v1.16b, v4.16b +; CHECK-NEXT: and v2.16b, v2.16b, v4.16b +; CHECK-NEXT: and v3.16b, v3.16b, v4.16b +; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v6.16b, v2.16b, v2.16b, #8 +; CHECK-NEXT: ext v7.16b, v3.16b, v3.16b, #8 +; CHECK-NEXT: zip1 v0.16b, v0.16b, v4.16b +; CHECK-NEXT: zip1 v1.16b, v1.16b, v5.16b +; CHECK-NEXT: zip1 v2.16b, v2.16b, v6.16b +; CHECK-NEXT: zip1 v3.16b, v3.16b, v7.16b +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: addv h2, v2.8h +; CHECK-NEXT: addv h3, v3.8h +; CHECK-NEXT: str h0, [x8] +; CHECK-NEXT: str h1, [x8, #6] +; CHECK-NEXT: str h2, [x8, #4] +; CHECK-NEXT: str h3, [x8, #2] +; CHECK-NEXT: ret +entry: + %0 = call <64 x i1> @llvm.loop.dependence.war.mask.v64i1(ptr %a, ptr %b, i64 1) + ret <64 x i1> %0 +} + +define <16 x i1> @whilewr_16_expand(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_expand: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: sub x8, x1, x0 +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: asr x8, x8, #1 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: mov z16.d, z0.d +; CHECK-NEXT: dup v3.2d, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8 +; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6 +; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4 +; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2 +; CHECK-NEXT: add z16.d, z16.d, #14 // =0xe +; CHECK-NEXT: cmhi v0.2d, v3.2d, v0.2d +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: cmhi v1.2d, v3.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, v3.2d, v2.2d +; CHECK-NEXT: cmhi v4.2d, v3.2d, v4.2d +; CHECK-NEXT: cmhi v5.2d, v3.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v3.2d, v6.2d +; CHECK-NEXT: cmhi v16.2d, v3.2d, v16.2d +; CHECK-NEXT: cmhi v3.2d, v3.2d, v7.2d +; CHECK-NEXT: uzp1 v2.4s, v4.4s, v2.4s +; CHECK-NEXT: uzp1 v4.4s, v6.4s, v5.4s +; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s +; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, w8 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 2) + ret <16 x i1> %0 +} + +define <32 x i1> @whilewr_16_expand2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_expand2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub x9, x1, x0 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: sub x10, x9, #32 +; CHECK-NEXT: add x9, x9, x9, lsr #63 +; CHECK-NEXT: add x10, x10, x10, lsr #63 +; CHECK-NEXT: asr x9, x9, #1 +; CHECK-NEXT: asr x10, x10, #1 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: 
dup v7.2d, x9 +; CHECK-NEXT: dup v16.2d, x10 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: cmp x10, #1 +; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8 +; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 +; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4 +; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2 +; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d +; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d +; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe +; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d +; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d +; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d +; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d +; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d +; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d +; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d +; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d +; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d +; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d +; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d +; CHECK-NEXT: cset w10, lt +; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d +; CHECK-NEXT: uzp1 v16.4s, v21.4s, v20.4s +; CHECK-NEXT: cmp x9, #1 +; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s +; CHECK-NEXT: uzp1 v17.4s, v17.4s, v24.4s +; CHECK-NEXT: cset w9, lt +; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; CHECK-NEXT: uzp1 v3.4s, v19.4s, v7.4s +; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; CHECK-NEXT: uzp1 v5.4s, v18.4s, v6.4s +; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp1 v1.8h, v17.8h, v20.8h +; CHECK-NEXT: uzp1 v3.8h, v16.8h, v3.8h +; CHECK-NEXT: uzp1 v4.8h, v5.8h, v4.8h +; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-NEXT: dup v2.16b, w9 +; CHECK-NEXT: adrp x9, .LCPI11_0 +; CHECK-NEXT: uzp1 v1.16b, v1.16b, v3.16b +; CHECK-NEXT: dup v3.16b, w10 +; CHECK-NEXT: uzp1 v0.16b, v4.16b, v0.16b +; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI11_0] +; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b +; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: str h1, [x8] +; CHECK-NEXT: str h0, [x8, #2] +; CHECK-NEXT: ret +entry: + %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 2) + ret <32 x i1> %0 +} + +define <8 x i1> @whilewr_32_expand(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_expand: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #2 +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 +; CHECK-NEXT: add z2.d, z2.d, #4 // =0x4 +; CHECK-NEXT: add z3.d, z3.d, #2 // =0x2 +; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d +; CHECK-NEXT: cmhi v4.2d, v1.2d, v4.2d +; CHECK-NEXT: cmhi v2.2d, v1.2d, v2.2d +; CHECK-NEXT: cmhi v1.2d, v1.2d, v3.2d +; CHECK-NEXT: uzp1 v2.4s, v2.4s, v4.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: dup v1.8b, w8 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret 
+entry: + %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 4) + ret <8 x i1> %0 +} + +define <16 x i1> @whilewr_32_expand2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_expand2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #2 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: mov z16.d, z0.d +; CHECK-NEXT: dup v3.2d, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8 +; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6 +; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4 +; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2 +; CHECK-NEXT: add z16.d, z16.d, #14 // =0xe +; CHECK-NEXT: cmhi v0.2d, v3.2d, v0.2d +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: cmhi v1.2d, v3.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, v3.2d, v2.2d +; CHECK-NEXT: cmhi v4.2d, v3.2d, v4.2d +; CHECK-NEXT: cmhi v5.2d, v3.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v3.2d, v6.2d +; CHECK-NEXT: cmhi v16.2d, v3.2d, v16.2d +; CHECK-NEXT: cmhi v3.2d, v3.2d, v7.2d +; CHECK-NEXT: uzp1 v2.4s, v4.4s, v2.4s +; CHECK-NEXT: uzp1 v4.4s, v6.4s, v5.4s +; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s +; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, w8 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 4) + ret <16 x i1> %0 +} + +define <32 x i1> @whilewr_32_expand3(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_expand3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x9, x1, x0 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: add x10, x9, #3 +; CHECK-NEXT: sub x11, x9, #61 +; CHECK-NEXT: csel x10, x10, x9, mi +; CHECK-NEXT: subs x9, x9, #64 +; CHECK-NEXT: csel x9, x11, x9, mi +; CHECK-NEXT: asr x10, x10, #2 +; CHECK-NEXT: asr x9, x9, #2 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: dup v7.2d, x10 +; CHECK-NEXT: dup v16.2d, x9 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: cmp x9, #1 +; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8 +; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 +; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4 +; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2 +; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d +; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d +; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe +; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d +; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d +; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d +; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d +; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d +; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d +; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d +; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d +; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d +; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d +; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d +; CHECK-NEXT: cset w9, lt +; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d +; CHECK-NEXT: uzp1 v16.4s, v21.4s, v20.4s +; CHECK-NEXT: cmp x10, #1 +; CHECK-NEXT: uzp1 
v20.4s, v23.4s, v22.4s +; CHECK-NEXT: uzp1 v17.4s, v17.4s, v24.4s +; CHECK-NEXT: cset w10, lt +; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; CHECK-NEXT: uzp1 v3.4s, v19.4s, v7.4s +; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; CHECK-NEXT: uzp1 v5.4s, v18.4s, v6.4s +; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp1 v1.8h, v17.8h, v20.8h +; CHECK-NEXT: uzp1 v3.8h, v16.8h, v3.8h +; CHECK-NEXT: uzp1 v4.8h, v5.8h, v4.8h +; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-NEXT: dup v2.16b, w10 +; CHECK-NEXT: uzp1 v1.16b, v1.16b, v3.16b +; CHECK-NEXT: dup v3.16b, w9 +; CHECK-NEXT: adrp x9, .LCPI14_0 +; CHECK-NEXT: uzp1 v0.16b, v4.16b, v0.16b +; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI14_0] +; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b +; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: str h1, [x8] +; CHECK-NEXT: str h0, [x8, #2] +; CHECK-NEXT: ret +entry: + %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 4) + ret <32 x i1> %0 +} + +define <4 x i1> @whilewr_64_expand(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: dup v2.2d, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: add z1.d, z1.d, #2 // =0x2 +; CHECK-NEXT: cmhi v0.2d, v2.2d, v0.2d +; CHECK-NEXT: cmhi v1.2d, v2.2d, v1.2d +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: dup v1.4h, w8 +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %0 = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr %a, ptr %b, i64 8) + ret <4 x i1> %0 +} + +define <8 x i1> @whilewr_64_expand2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 +; CHECK-NEXT: add z2.d, z2.d, #4 // =0x4 +; CHECK-NEXT: add z3.d, z3.d, #2 // =0x2 +; CHECK-NEXT: cmhi v0.2d, v1.2d, v0.2d +; CHECK-NEXT: cmhi v4.2d, v1.2d, v4.2d +; CHECK-NEXT: cmhi v2.2d, v1.2d, v2.2d +; CHECK-NEXT: cmhi v1.2d, v1.2d, v3.2d +; CHECK-NEXT: uzp1 v2.4s, v2.4s, v4.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v1.4s +; CHECK-NEXT: dup v1.8b, w8 +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v2.8h +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: orr v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret +entry: + %0 = call <8 x i1> @llvm.loop.dependence.war.mask.v8i1(ptr %a, ptr %b, i64 8) + ret <8 x i1> %0 +} + +define <16 x i1> @whilewr_64_expand3(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, 
x8, mi +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: mov z16.d, z0.d +; CHECK-NEXT: dup v3.2d, x8 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8 +; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6 +; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4 +; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2 +; CHECK-NEXT: add z16.d, z16.d, #14 // =0xe +; CHECK-NEXT: cmhi v0.2d, v3.2d, v0.2d +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: cmhi v1.2d, v3.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, v3.2d, v2.2d +; CHECK-NEXT: cmhi v4.2d, v3.2d, v4.2d +; CHECK-NEXT: cmhi v5.2d, v3.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v3.2d, v6.2d +; CHECK-NEXT: cmhi v16.2d, v3.2d, v16.2d +; CHECK-NEXT: cmhi v3.2d, v3.2d, v7.2d +; CHECK-NEXT: uzp1 v2.4s, v4.4s, v2.4s +; CHECK-NEXT: uzp1 v4.4s, v6.4s, v5.4s +; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s +; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, w8 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 8) + ret <16 x i1> %0 +} + +define <32 x i1> @whilewr_64_expand4(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x9, x1, x0 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: add x10, x9, #7 +; CHECK-NEXT: sub x11, x9, #121 +; CHECK-NEXT: csel x10, x10, x9, mi +; CHECK-NEXT: subs x9, x9, #128 +; CHECK-NEXT: csel x9, x11, x9, mi +; CHECK-NEXT: asr x10, x10, #3 +; CHECK-NEXT: asr x9, x9, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: dup v7.2d, x10 +; CHECK-NEXT: dup v16.2d, x9 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: cmp x9, #1 +; CHECK-NEXT: add z3.d, z3.d, #8 // =0x8 +; CHECK-NEXT: add z4.d, z4.d, #6 // =0x6 +; CHECK-NEXT: add z5.d, z5.d, #4 // =0x4 +; CHECK-NEXT: add z6.d, z6.d, #2 // =0x2 +; CHECK-NEXT: cmhi v17.2d, v7.2d, v0.2d +; CHECK-NEXT: cmhi v18.2d, v16.2d, v0.2d +; CHECK-NEXT: add z0.d, z0.d, #14 // =0xe +; CHECK-NEXT: cmhi v19.2d, v7.2d, v1.2d +; CHECK-NEXT: cmhi v20.2d, v7.2d, v2.2d +; CHECK-NEXT: cmhi v21.2d, v7.2d, v3.2d +; CHECK-NEXT: cmhi v22.2d, v7.2d, v4.2d +; CHECK-NEXT: cmhi v23.2d, v7.2d, v5.2d +; CHECK-NEXT: cmhi v24.2d, v7.2d, v6.2d +; CHECK-NEXT: cmhi v1.2d, v16.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, v16.2d, v2.2d +; CHECK-NEXT: cmhi v3.2d, v16.2d, v3.2d +; CHECK-NEXT: cmhi v4.2d, v16.2d, v4.2d +; CHECK-NEXT: cmhi v7.2d, v7.2d, v0.2d +; CHECK-NEXT: cmhi v5.2d, v16.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v16.2d, v6.2d +; CHECK-NEXT: cset w9, lt +; CHECK-NEXT: cmhi v0.2d, v16.2d, v0.2d +; CHECK-NEXT: uzp1 v16.4s, v21.4s, v20.4s +; CHECK-NEXT: cmp x10, #1 +; CHECK-NEXT: uzp1 v20.4s, v23.4s, v22.4s +; CHECK-NEXT: uzp1 v17.4s, v17.4s, v24.4s +; CHECK-NEXT: cset w10, lt +; CHECK-NEXT: uzp1 v2.4s, v3.4s, v2.4s +; CHECK-NEXT: uzp1 v3.4s, v19.4s, v7.4s +; CHECK-NEXT: uzp1 v4.4s, v5.4s, v4.4s +; CHECK-NEXT: uzp1 v5.4s, v18.4s, v6.4s +; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; CHECK-NEXT: uzp1 v1.8h, v17.8h, v20.8h +; CHECK-NEXT: uzp1 
v3.8h, v16.8h, v3.8h +; CHECK-NEXT: uzp1 v4.8h, v5.8h, v4.8h +; CHECK-NEXT: uzp1 v0.8h, v2.8h, v0.8h +; CHECK-NEXT: dup v2.16b, w10 +; CHECK-NEXT: uzp1 v1.16b, v1.16b, v3.16b +; CHECK-NEXT: dup v3.16b, w9 +; CHECK-NEXT: adrp x9, .LCPI18_0 +; CHECK-NEXT: uzp1 v0.16b, v4.16b, v0.16b +; CHECK-NEXT: orr v1.16b, v1.16b, v2.16b +; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI18_0] +; CHECK-NEXT: orr v0.16b, v0.16b, v3.16b +; CHECK-NEXT: shl v1.16b, v1.16b, #7 +; CHECK-NEXT: shl v0.16b, v0.16b, #7 +; CHECK-NEXT: cmlt v1.16b, v1.16b, #0 +; CHECK-NEXT: cmlt v0.16b, v0.16b, #0 +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ext v2.16b, v1.16b, v1.16b, #8 +; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: zip1 v1.16b, v1.16b, v2.16b +; CHECK-NEXT: zip1 v0.16b, v0.16b, v3.16b +; CHECK-NEXT: addv h1, v1.8h +; CHECK-NEXT: addv h0, v0.8h +; CHECK-NEXT: str h1, [x8] +; CHECK-NEXT: str h0, [x8, #2] +; CHECK-NEXT: ret +entry: + %0 = call <32 x i1> @llvm.loop.dependence.war.mask.v32i1(ptr %a, ptr %b, i64 8) + ret <32 x i1> %0 +} + +define <9 x i1> @whilewr_8_widen(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_widen: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: mov z0.b, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: umov w9, v0.b[0] +; CHECK-NEXT: umov w10, v0.b[1] +; CHECK-NEXT: umov w11, v0.b[2] +; CHECK-NEXT: umov w12, v0.b[7] +; CHECK-NEXT: and w9, w9, #0x1 +; CHECK-NEXT: bfi w9, w10, #1, #1 +; CHECK-NEXT: umov w10, v0.b[3] +; CHECK-NEXT: bfi w9, w11, #2, #1 +; CHECK-NEXT: umov w11, v0.b[4] +; CHECK-NEXT: bfi w9, w10, #3, #1 +; CHECK-NEXT: umov w10, v0.b[5] +; CHECK-NEXT: bfi w9, w11, #4, #1 +; CHECK-NEXT: umov w11, v0.b[6] +; CHECK-NEXT: bfi w9, w10, #5, #1 +; CHECK-NEXT: umov w10, v0.b[8] +; CHECK-NEXT: bfi w9, w11, #6, #1 +; CHECK-NEXT: ubfiz w11, w12, #7, #1 +; CHECK-NEXT: orr w9, w9, w11 +; CHECK-NEXT: orr w9, w9, w10, lsl #8 +; CHECK-NEXT: and w9, w9, #0x1ff +; CHECK-NEXT: strh w9, [x8] +; CHECK-NEXT: ret +entry: + %0 = call <9 x i1> @llvm.loop.dependence.war.mask.v9i1(ptr %a, ptr %b, i64 1) + ret <9 x i1> %0 +} + +define <7 x i1> @whilewr_16_widen(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_widen: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.h, x0, x1 +; CHECK-NEXT: mov z0.h, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.8b, v0.8h +; CHECK-NEXT: umov w0, v0.b[0] +; CHECK-NEXT: umov w1, v0.b[1] +; CHECK-NEXT: umov w2, v0.b[2] +; CHECK-NEXT: umov w3, v0.b[3] +; CHECK-NEXT: umov w4, v0.b[4] +; CHECK-NEXT: umov w5, v0.b[5] +; CHECK-NEXT: umov w6, v0.b[6] +; CHECK-NEXT: ret +entry: + %0 = call <7 x i1> @llvm.loop.dependence.war.mask.v7i1(ptr %a, ptr %b, i64 2) + ret <7 x i1> %0 +} + +define <3 x i1> @whilewr_32_widen(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_widen: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: mov z0.s, p0/z, #-1 // =0xffffffffffffffff +; CHECK-NEXT: xtn v0.4h, v0.4s +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: umov w1, v0.h[1] +; CHECK-NEXT: umov w2, v0.h[2] +; CHECK-NEXT: ret +entry: + %0 = call <3 x i1> @llvm.loop.dependence.war.mask.v3i1(ptr %a, ptr %b, i64 4) + ret <3 x i1> %0 +} + +define <16 x i1> @whilewr_badimm(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_badimm: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 +; CHECK-NEXT: sub x9, x1, x0 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: movk x8, #21846 +; CHECK-NEXT: smulh x8, x9, x8 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov 
z2.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z6.d, z0.d +; CHECK-NEXT: mov z7.d, z0.d +; CHECK-NEXT: mov z16.d, z0.d +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: add z1.d, z1.d, #12 // =0xc +; CHECK-NEXT: add z2.d, z2.d, #10 // =0xa +; CHECK-NEXT: add z4.d, z4.d, #8 // =0x8 +; CHECK-NEXT: add z5.d, z5.d, #6 // =0x6 +; CHECK-NEXT: add z6.d, z6.d, #4 // =0x4 +; CHECK-NEXT: dup v3.2d, x8 +; CHECK-NEXT: add z16.d, z16.d, #14 // =0xe +; CHECK-NEXT: add z7.d, z7.d, #2 // =0x2 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: cmhi v0.2d, v3.2d, v0.2d +; CHECK-NEXT: cmhi v1.2d, v3.2d, v1.2d +; CHECK-NEXT: cmhi v2.2d, v3.2d, v2.2d +; CHECK-NEXT: cmhi v4.2d, v3.2d, v4.2d +; CHECK-NEXT: cmhi v16.2d, v3.2d, v16.2d +; CHECK-NEXT: cmhi v5.2d, v3.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v3.2d, v6.2d +; CHECK-NEXT: cmhi v3.2d, v3.2d, v7.2d +; CHECK-NEXT: uzp1 v1.4s, v1.4s, v16.4s +; CHECK-NEXT: uzp1 v2.4s, v4.4s, v2.4s +; CHECK-NEXT: uzp1 v4.4s, v6.4s, v5.4s +; CHECK-NEXT: uzp1 v0.4s, v0.4s, v3.4s +; CHECK-NEXT: uzp1 v1.8h, v2.8h, v1.8h +; CHECK-NEXT: uzp1 v0.8h, v0.8h, v4.8h +; CHECK-NEXT: uzp1 v0.16b, v0.16b, v1.16b +; CHECK-NEXT: dup v1.16b, w8 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 3) + ret <16 x i1> %0 +} + +; Scalarizing <1 x i1> types + +define <1 x i1> @whilewr_8_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 1) + ret <1 x i1> %0 +} + +define <1 x i1> @whilewr_16_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 2) + ret <1 x i1> %0 +} + +define <1 x i1> @whilewr_32_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #3 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 4) + ret <1 x i1> %0 +} + +define <1 x i1> @whilewr_64_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #7 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.war.mask.v1i1(ptr %a, ptr %b, i64 8) + ret <1 x i1> %0 +} + +define <1 x i1> @whilerw_8_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_8_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 1) + ret <1 x i1> %0 +} + +define <1 x i1> @whilerw_16_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_16_scalarize: +; CHECK: // %bb.0: // 
%entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 2) + ret <1 x i1> %0 +} + +define <1 x i1> @whilerw_32_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_32_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #3 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 4) + ret <1 x i1> %0 +} + +define <1 x i1> @whilerw_64_scalarize(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_64_scalarize: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: cmp x8, #7 +; CHECK-NEXT: cset w8, gt +; CHECK-NEXT: cmp x1, x0 +; CHECK-NEXT: csinc w0, w8, wzr, ne +; CHECK-NEXT: ret +entry: + %0 = call <1 x i1> @llvm.loop.dependence.raw.mask.v1i1(ptr %a, ptr %b, i64 8) + ret <1 x i1> %0 +} diff --git a/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll b/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll new file mode 100644 index 0000000000000..922b37c2f2a08 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/alias_mask_nosve.ll @@ -0,0 +1,48 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s + +define <16 x i1> @whilewr_8(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: adrp x8, .LCPI0_0 +; CHECK-NEXT: adrp x10, .LCPI0_1 +; CHECK-NEXT: sub x9, x1, x0 +; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI0_0] +; CHECK-NEXT: adrp x8, .LCPI0_2 +; CHECK-NEXT: ldr q1, [x10, :lo12:.LCPI0_1] +; CHECK-NEXT: ldr q3, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: adrp x8, .LCPI0_4 +; CHECK-NEXT: adrp x10, .LCPI0_3 +; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI0_4] +; CHECK-NEXT: adrp x8, .LCPI0_5 +; CHECK-NEXT: dup v2.2d, x9 +; CHECK-NEXT: ldr q4, [x10, :lo12:.LCPI0_3] +; CHECK-NEXT: adrp x10, .LCPI0_6 +; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI0_5] +; CHECK-NEXT: adrp x8, .LCPI0_7 +; CHECK-NEXT: ldr q7, [x10, :lo12:.LCPI0_6] +; CHECK-NEXT: cmp x9, #1 +; CHECK-NEXT: ldr q16, [x8, :lo12:.LCPI0_7] +; CHECK-NEXT: cmhi v0.2d, v2.2d, v0.2d +; CHECK-NEXT: cmhi v1.2d, v2.2d, v1.2d +; CHECK-NEXT: cmhi v3.2d, v2.2d, v3.2d +; CHECK-NEXT: cmhi v4.2d, v2.2d, v4.2d +; CHECK-NEXT: cmhi v5.2d, v2.2d, v5.2d +; CHECK-NEXT: cmhi v6.2d, v2.2d, v6.2d +; CHECK-NEXT: cmhi v7.2d, v2.2d, v7.2d +; CHECK-NEXT: cmhi v2.2d, v2.2d, v16.2d +; CHECK-NEXT: uzp1 v0.4s, v1.4s, v0.4s +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: uzp1 v1.4s, v4.4s, v3.4s +; CHECK-NEXT: uzp1 v3.4s, v6.4s, v5.4s +; CHECK-NEXT: uzp1 v2.4s, v2.4s, v7.4s +; CHECK-NEXT: uzp1 v0.8h, v1.8h, v0.8h +; CHECK-NEXT: uzp1 v1.8h, v2.8h, v3.8h +; CHECK-NEXT: uzp1 v0.16b, v1.16b, v0.16b +; CHECK-NEXT: dup v1.16b, w8 +; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret +entry: + %0 = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr %a, ptr %b, i64 1) + ret <16 x i1> %0 +} diff --git a/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll new file mode 100644 index 0000000000000..179dcfa11c108 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/alias_mask_scalable.ll @@ -0,0 +1,767 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s 
+ +define <vscale x 16 x i1> @whilewr_8(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 1) + ret <vscale x 16 x i1> %0 +} + +define <vscale x 8 x i1> @whilewr_16(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.h, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 8 x i1> @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 2) + ret <vscale x 8 x i1> %0 +} + +define <vscale x 4 x i1> @whilewr_32(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 4) + ret <vscale x 4 x i1> %0 +} + +define <vscale x 2 x i1> @whilewr_64(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.d, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 2 x i1> @llvm.loop.dependence.war.mask.nxv2i1(ptr %a, ptr %b, i64 8) + ret <vscale x 2 x i1> %0 +} + +define <vscale x 16 x i1> @whilerw_8(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.b, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 16 x i1> @llvm.loop.dependence.raw.mask.nxv16i1(ptr %a, ptr %b, i64 1) + ret <vscale x 16 x i1> %0 +} + +define <vscale x 8 x i1> @whilerw_16(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.h, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 8 x i1> @llvm.loop.dependence.raw.mask.nxv8i1(ptr %a, ptr %b, i64 2) + ret <vscale x 8 x i1> %0 +} + +define <vscale x 4 x i1> @whilerw_32(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.s, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 4 x i1> @llvm.loop.dependence.raw.mask.nxv4i1(ptr %a, ptr %b, i64 4) + ret <vscale x 4 x i1> %0 +} + +define <vscale x 2 x i1> @whilerw_64(ptr %a, ptr %b) { +; CHECK-LABEL: whilerw_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.d, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 2 x i1> @llvm.loop.dependence.raw.mask.nxv2i1(ptr %a, ptr %b, i64 8) + ret <vscale x 2 x i1> %0 +} + +define <vscale x 32 x i1> @whilewr_8_split(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_split: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: incb x0 +; CHECK-NEXT: whilewr p1.b, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 32 x i1> @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 1) + ret <vscale x 32 x i1> %0 +} + +define <vscale x 64 x i1> @whilewr_8_split2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_split2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: addvl x9, x0, #3 +; CHECK-NEXT: incb x0, all, mul #2 +; CHECK-NEXT: incb x8 +; CHECK-NEXT: whilewr p3.b, x9, x1 +; CHECK-NEXT: whilewr p2.b, x0, x1 +; CHECK-NEXT: whilewr p1.b, x8, x1 +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 64 x i1> @llvm.loop.dependence.war.mask.nxv64i1(ptr %a, ptr %b, i64 1) + ret <vscale x 64 x i1> %0 +} + +define <vscale x 16 x i1> @whilewr_16_expand(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_expand: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]!
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: sub x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: asr x8, x8, #1 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: incd z5.d, all, mul #4 +; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z0.d +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z1.d +; CHECK-NEXT: incd z1.d, all, mul #4 +; CHECK-NEXT: cmphi p3.d, p0/z, z2.d, z4.d +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: cmphi p4.d, p0/z, z2.d, z5.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: cmphi p5.d, p0/z, z2.d, z1.d +; CHECK-NEXT: cmphi p7.d, p0/z, z2.d, z4.d +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: cmphi p6.d, p0/z, z2.d, z3.d +; CHECK-NEXT: uzp1 p2.s, p4.s, p5.s +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: incd z0.d, all, mul #4 +; CHECK-NEXT: uzp1 p3.s, p3.s, p6.s +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d +; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 2) + ret %0 +} + +define @whilewr_16_expand2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_expand2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: sub x8, x1, x0 +; CHECK-NEXT: incb x0, all, mul #2 +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: asr x8, x8, #1 +; CHECK-NEXT: sub x9, x1, x0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z5.d, x8 +; CHECK-NEXT: add x9, x9, x9, lsr #63 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z2.d, all, mul #2 +; CHECK-NEXT: incd z3.d, all, mul #4 +; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z0.d +; CHECK-NEXT: asr x9, x9, #1 +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: mov z6.d, z1.d +; CHECK-NEXT: mov z7.d, z2.d +; CHECK-NEXT: cmphi p1.d, p0/z, z5.d, z1.d +; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z3.d +; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z2.d +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: incd z6.d, all, mul #4 +; CHECK-NEXT: incd z7.d, all, mul #4 +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: mov z24.d, z4.d +; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z6.d +; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z4.d +; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z7.d +; CHECK-NEXT: incd z24.d, all, mul #4 +; CHECK-NEXT: uzp1 p2.s, p3.s, p4.s +; CHECK-NEXT: uzp1 p3.s, p5.s, p6.s +; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z24.d +; CHECK-NEXT: mov z5.d, x9 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z24.d +; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z7.d +; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z6.d +; CHECK-NEXT: uzp1 p7.s, p7.s, p8.s +; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z3.d +; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z4.d +; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z2.d +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p2.h, p2.h, p7.h +; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z1.d +; CHECK-NEXT: cmphi p0.d, p0/z, z5.d, z0.d +; CHECK-NEXT: uzp1 p4.s, p5.s, p4.s +; CHECK-NEXT: uzp1 p5.s, p9.s, p6.s +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: whilelo p6.b, xzr, x8 +; CHECK-NEXT: uzp1 p3.s, p8.s, p3.s +; CHECK-NEXT: cmp x9, #1 +; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.s, p0.s, p7.s +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p4.h, p5.h, p4.h +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p0.h, p3.h +; CHECK-NEXT: uzp1 p1.b, p1.b, p2.b +; CHECK-NEXT: uzp1 p2.b, p0.b, p4.b +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: whilelo p3.b, xzr, x8 +; CHECK-NEXT: sel p0.b, p1, p1.b, p6.b +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel p1.b, p2, p2.b, p3.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 2) + ret %0 +} + +define 
@whilewr_32_expand(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_expand: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #2 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z2.d, all, mul #2 +; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z0.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z1.d +; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z2.d +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: cmphi p0.d, p0/z, z3.d, z4.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p3.s, p0.s +; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h +; CHECK-NEXT: whilelo p1.h, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 4) + ret %0 +} + +define @whilewr_32_expand2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_expand2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #2 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: incd z5.d, all, mul #4 +; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z0.d +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z1.d +; CHECK-NEXT: incd z1.d, all, mul #4 +; CHECK-NEXT: cmphi p3.d, p0/z, z2.d, z4.d +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: cmphi p4.d, p0/z, z2.d, z5.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: cmphi p5.d, p0/z, z2.d, z1.d +; CHECK-NEXT: cmphi p7.d, p0/z, z2.d, z4.d +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: cmphi p6.d, p0/z, z2.d, z3.d +; CHECK-NEXT: uzp1 p2.s, p4.s, p5.s +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: incd z0.d, all, mul #4 +; CHECK-NEXT: uzp1 p3.s, p3.s, p6.s +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d +; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 4) + ret %0 +} + +define 
@whilewr_32_expand3(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_expand3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: incb x0, all, mul #4 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #2 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z2.d, all, mul #2 +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z0.d +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: mov z6.d, z2.d +; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z4.d +; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z2.d +; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z1.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: incd z6.d, all, mul #4 +; CHECK-NEXT: incd z7.d, all, mul #4 +; CHECK-NEXT: uzp1 p4.s, p5.s, p4.s +; CHECK-NEXT: mov z24.d, z3.d +; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z6.d +; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z7.d +; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z3.d +; CHECK-NEXT: incd z24.d, all, mul #4 +; CHECK-NEXT: uzp1 p2.s, p2.s, p7.s +; CHECK-NEXT: uzp1 p3.s, p3.s, p8.s +; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z24.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: uzp1 p3.h, p4.h, p3.h +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s +; CHECK-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: uzp1 p2.h, p2.h, p6.h +; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: uzp1 p2.b, p3.b, p2.b +; CHECK-NEXT: asr x8, x8, #2 +; CHECK-NEXT: mov z5.d, x8 +; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z24.d +; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z6.d +; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z7.d +; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z4.d +; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z3.d +; CHECK-NEXT: cmphi p10.d, p0/z, z5.d, z2.d +; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z1.d +; CHECK-NEXT: cmphi p0.d, p0/z, z5.d, z0.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: uzp1 p5.s, p7.s, p5.s +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: uzp1 p7.s, p9.s, p8.s +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p4.s, p10.s, p4.s +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.s, p0.s, p6.s +; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p5.h, p7.h, p5.h +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p0.h, p4.h +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: whilelo p4.b, xzr, x8 +; CHECK-NEXT: uzp1 p3.b, p0.b, p5.b +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel p0.b, p2, p2.b, p1.b +; 
CHECK-NEXT: sel p1.b, p3, p3.b, p4.b +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 4) + ret %0 +} + +define @whilewr_64_expand(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z0.d +; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z1.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p1.s, p0.s +; CHECK-NEXT: whilelo p1.s, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv4i1(ptr %a, ptr %b, i64 8) + ret %0 +} + +define @whilewr_64_expand2(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z2.d, all, mul #2 +; CHECK-NEXT: cmphi p1.d, p0/z, z3.d, z0.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: cmphi p2.d, p0/z, z3.d, z1.d +; CHECK-NEXT: cmphi p3.d, p0/z, z3.d, z2.d +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: cmphi p0.d, p0/z, z3.d, z4.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p3.s, p0.s +; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h +; CHECK-NEXT: whilelo p1.h, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv8i1(ptr %a, ptr %b, i64 8) + ret %0 +} + +define @whilewr_64_expand3(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: incd z5.d, all, mul #4 +; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z0.d +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z1.d +; CHECK-NEXT: incd z1.d, all, mul #4 +; CHECK-NEXT: cmphi p3.d, p0/z, z2.d, z4.d +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: cmphi p4.d, p0/z, z2.d, z5.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: cmphi p5.d, p0/z, z2.d, z1.d +; CHECK-NEXT: cmphi p7.d, p0/z, z2.d, z4.d +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: cmphi p6.d, p0/z, z2.d, z3.d +; CHECK-NEXT: uzp1 p2.s, p4.s, p5.s +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: incd z0.d, all, mul #4 +; CHECK-NEXT: uzp1 p3.s, p3.s, p6.s +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d +; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 8) + ret %0 +} + +define @whilewr_64_expand4(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_64_expand4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p10, [sp, #1, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #2, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #3, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: subs x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: addvl x9, x0, #8 +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, x8 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z2.d, all, mul #2 +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z0.d +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: mov z6.d, z2.d +; CHECK-NEXT: mov z7.d, z1.d +; CHECK-NEXT: cmphi p2.d, p0/z, z5.d, z4.d +; CHECK-NEXT: cmphi p3.d, p0/z, z5.d, z2.d +; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z1.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: incd z6.d, all, mul #4 +; CHECK-NEXT: incd z7.d, all, mul #4 +; CHECK-NEXT: uzp1 p4.s, p5.s, p4.s +; CHECK-NEXT: mov z24.d, z3.d +; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z6.d +; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z7.d +; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z3.d +; CHECK-NEXT: incd z24.d, all, mul #4 +; CHECK-NEXT: uzp1 p2.s, p2.s, p7.s +; CHECK-NEXT: uzp1 p3.s, p3.s, p8.s +; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z24.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: uzp1 p3.h, p4.h, p3.h +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p6.s, p6.s, p9.s +; CHECK-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NEXT: subs x8, x1, x9 +; CHECK-NEXT: uzp1 p2.h, p2.h, p6.h +; CHECK-NEXT: add x9, x8, #7 +; CHECK-NEXT: csel x8, x9, x8, mi +; CHECK-NEXT: uzp1 p2.b, p3.b, p2.b +; CHECK-NEXT: asr x8, x8, #3 +; CHECK-NEXT: mov z5.d, x8 +; CHECK-NEXT: cmphi p5.d, p0/z, z5.d, z24.d +; CHECK-NEXT: cmphi p7.d, p0/z, z5.d, z6.d +; CHECK-NEXT: cmphi p8.d, p0/z, z5.d, z7.d +; CHECK-NEXT: cmphi p9.d, p0/z, z5.d, z4.d +; CHECK-NEXT: cmphi p4.d, p0/z, z5.d, z3.d +; CHECK-NEXT: cmphi p10.d, p0/z, z5.d, z2.d +; CHECK-NEXT: cmphi p6.d, p0/z, z5.d, z1.d +; CHECK-NEXT: cmphi p0.d, p0/z, z5.d, z0.d +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: uzp1 p5.s, p7.s, p5.s +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: uzp1 p7.s, p9.s, p8.s +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: ldr p9, [sp, #2, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p4.s, p10.s, p4.s +; CHECK-NEXT: ldr p10, [sp, #1, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.s, p0.s, p6.s +; CHECK-NEXT: ldr p8, [sp, #3, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p5.h, p7.h, p5.h +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p0.h, p4.h +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: whilelo p4.b, xzr, x8 +; CHECK-NEXT: uzp1 p3.b, p0.b, p5.b +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: sel p0.b, p2, p2.b, p1.b +; CHECK-NEXT: sel p1.b, p3, p3.b, p4.b +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; 
CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv32i1(ptr %a, ptr %b, i64 8) + ret %0 +} + +define @whilewr_8_widen(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8_widen: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.b, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv9i1(ptr %a, ptr %b, i64 1) + ret %0 +} + +define @whilewr_16_widen(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_16_widen: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.h, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv7i1(ptr %a, ptr %b, i64 2) + ret %0 +} + +define @whilewr_32_widen(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_32_widen: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilewr p0.s, x0, x1 +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv3i1(ptr %a, ptr %b, i64 4) + ret %0 +} + +define @whilewr_badimm(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_badimm: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 +; CHECK-NEXT: sub x9, x1, x0 +; CHECK-NEXT: movk x8, #21846 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: smulh x8, x9, x8 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z5.d, z0.d +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: incd z5.d, all, mul #4 +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: mov z3.d, z1.d +; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z0.d +; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z1.d +; CHECK-NEXT: incd z1.d, all, mul #4 +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: cmphi p3.d, p0/z, z2.d, z4.d +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: cmphi p4.d, p0/z, z2.d, z5.d +; CHECK-NEXT: cmphi p5.d, p0/z, z2.d, z1.d +; CHECK-NEXT: mov z0.d, z3.d +; CHECK-NEXT: cmphi p6.d, p0/z, z2.d, z3.d +; CHECK-NEXT: cmphi p7.d, p0/z, z2.d, z4.d +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: incd z0.d, all, mul #4 +; CHECK-NEXT: uzp1 p2.s, p4.s, p5.s +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p3.s, p3.s, p6.s +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z0.d +; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 3) + ret %0 +} diff --git a/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll 
b/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll new file mode 100644 index 0000000000000..8b5ea0bc3b3ce --- /dev/null +++ b/llvm/test/CodeGen/AArch64/alias_mask_scalable_nosve2.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64 -mattr=+sve %s -o - | FileCheck %s + +define <vscale x 16 x i1> @whilewr_8(ptr %a, ptr %b) { +; CHECK-LABEL: whilewr_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p7, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: sub x8, x1, x0 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: mov z2.d, x8 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: cmphi p1.d, p0/z, z2.d, z0.d +; CHECK-NEXT: incd z0.d, all, mul #4 +; CHECK-NEXT: incd z1.d +; CHECK-NEXT: incd z3.d, all, mul #2 +; CHECK-NEXT: cmphi p5.d, p0/z, z2.d, z0.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: cmphi p2.d, p0/z, z2.d, z1.d +; CHECK-NEXT: incd z1.d, all, mul #4 +; CHECK-NEXT: cmphi p3.d, p0/z, z2.d, z3.d +; CHECK-NEXT: incd z3.d, all, mul #4 +; CHECK-NEXT: incd z4.d, all, mul #2 +; CHECK-NEXT: cmphi p6.d, p0/z, z2.d, z1.d +; CHECK-NEXT: cmphi p7.d, p0/z, z2.d, z3.d +; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: cmphi p4.d, p0/z, z2.d, z4.d +; CHECK-NEXT: incd z4.d, all, mul #4 +; CHECK-NEXT: uzp1 p2.s, p5.s, p6.s +; CHECK-NEXT: ldr p6, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cmphi p0.d, p0/z, z2.d, z4.d +; CHECK-NEXT: uzp1 p3.s, p3.s, p4.s +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: uzp1 p1.h, p1.h, p3.h +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: uzp1 p0.s, p7.s, p0.s +; CHECK-NEXT: ldr p7, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p0.h, p2.h, p0.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %0 = call <vscale x 16 x i1> @llvm.loop.dependence.war.mask.nxv16i1(ptr %a, ptr %b, i64 1) + ret <vscale x 16 x i1> %0 +}