diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 425ea311d653a..091d94843698c 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -166,6 +166,7 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
   case Intrinsic::is_fpclass:
   case Intrinsic::vp_is_fpclass:
   case Intrinsic::powi:
+  case Intrinsic::vector_extract:
     return (ScalarOpdIdx == 1);
   case Intrinsic::smul_fix:
   case Intrinsic::smul_fix_sat:
@@ -200,6 +201,7 @@ bool llvm::isVectorIntrinsicWithOverloadTypeAtArg(
   case Intrinsic::vp_llrint:
   case Intrinsic::ucmp:
   case Intrinsic::scmp:
+  case Intrinsic::vector_extract:
     return OpdIdx == -1 || OpdIdx == 0;
   case Intrinsic::modf:
   case Intrinsic::sincos:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6317bc3c20e25..ac362e71a5c7f 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4214,9 +4214,16 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
           }
         }
       }
-      [[fallthrough]];
+      C += VPI->cost(VF, CostCtx);
+      break;
+    }
+    case VPInstruction::ActiveLaneMask: {
+      unsigned Multiplier =
+          cast<ConstantInt>(VPI->getOperand(2)->getLiveInIRValue())
+              ->getZExtValue();
+      C += VPI->cost(VF * Multiplier, CostCtx);
+      break;
+    }
-    case VPInstruction::ActiveLaneMask:
     case VPInstruction::ExplicitVectorLength:
       C += VPI->cost(VF, CostCtx);
       break;
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 33bcb49b81740..da0390951329d 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -975,6 +975,10 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags,
     Not,
     SLPLoad,
     SLPStore,
+    // Creates a mask where each lane is active (true) whilst the current
+    // counter (first operand + index) is less than the second operand, i.e.
+    // mask[i] = icmp ult (op0 + i), op1
+    // The size of the mask returned is VF * Multiplier (UF, third op).
     ActiveLaneMask,
     ExplicitVectorLength,
     CalculateTripCountMinusVF,
@@ -1999,6 +2003,9 @@ class LLVM_ABI_FOR_TEST VPHeaderPHIRecipe : public VPSingleDefRecipe,
     return getOperand(1);
   }
 
+  /// Update the incoming value from the loop backedge.
+  void setBackedgeValue(VPValue *V) { setOperand(1, V); }
+
   /// Returns the backedge value as a recipe. The backedge value is guaranteed
   /// to be a recipe.
   virtual VPRecipeBase &getBackedgeRecipe() {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
index b33359c9bb0d6..d51c4aade02ad 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -316,10 +316,10 @@ m_ExtractLastElement(const Op0_t &Op0) {
   return m_VPInstruction<VPInstruction::ExtractLastElement>(Op0);
 }
 
-template <typename Op0_t, typename Op1_t>
-inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t>
-m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1) {
-  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1);
+template <typename Op0_t, typename Op1_t, typename Op2_t>
+inline VPInstruction_match<VPInstruction::ActiveLaneMask, Op0_t, Op1_t, Op2_t>
+m_ActiveLaneMask(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) {
+  return m_VPInstruction<VPInstruction::ActiveLaneMask>(Op0, Op1, Op2);
 }
 
 template
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 86834ab1240c1..a1efecb358c43 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -471,7 +471,6 @@ unsigned VPInstruction::getNumOperandsForOpcode(unsigned Opcode) {
   case Instruction::ICmp:
   case Instruction::FCmp:
   case Instruction::Store:
-  case VPInstruction::ActiveLaneMask:
   case VPInstruction::BranchOnCount:
   case VPInstruction::ComputeReductionResult:
   case VPInstruction::FirstOrderRecurrenceSplice:
@@ -481,6 +480,7 @@
   case VPInstruction::WideIVStep:
     return 2;
   case Instruction::Select:
+  case VPInstruction::ActiveLaneMask:
   case VPInstruction::ComputeAnyOfResult:
   case VPInstruction::ReductionStartVector:
     return 3;
@@ -620,7 +620,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
                                    Name);
 
     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
-    auto *PredTy = VectorType::get(Int1Ty, State.VF);
+    auto PredTy = VectorType::get(
+        Int1Ty, State.VF * cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
+                               ->getZExtValue());
     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
                                    {PredTy, ScalarTC->getType()},
                                    {VIVElem0, ScalarTC}, nullptr, Name);
@@ -1091,7 +1093,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
   }
   case VPInstruction::ActiveLaneMask: {
     Type *ArgTy = Ctx.Types.inferScalarType(getOperand(0));
-    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF);
+    unsigned Multiplier =
+        cast<ConstantInt>(getOperand(2)->getLiveInIRValue())->getZExtValue();
+    Type *RetTy = toVectorTy(Type::getInt1Ty(Ctx.LLVMCtx), VF * Multiplier);
     IntrinsicCostAttributes Attrs(Intrinsic::get_active_lane_mask, RetTy,
                                   {ArgTy, ArgTy});
     return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d32d2a9ad11f7..6ac133f0b84bd 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -39,6 +39,10 @@
 using namespace llvm;
 using namespace VPlanPatternMatch;
 
+cl::opt<bool> EnableWideActiveLaneMask(
+    "enable-wide-lane-mask", cl::init(false), cl::Hidden,
+    cl::desc("Enable use of wide get active lane mask instructions"));
+
 bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
     VPlanPtr &Plan,
     function_ref
@@ -1467,6 +1471,102 @@ static bool isConditionTrueViaVFAndUF(VPValue *Cond, VPlan &Plan,
   return SE.isKnownPredicate(CmpInst::ICMP_EQ, VectorTripCount, C);
 }
 
+/// Try to replace multiple active lane masks used for control flow with
+/// a single, wide active lane mask instruction followed by multiple
+/// extract subvector intrinsics. This applies to the active lane mask
+/// instructions both in the loop and in the preheader.
+/// Incoming values of all ActiveLaneMaskPHIs are updated to use the
+/// new extracts from the first active lane mask, which has its last
+/// operand (multiplier) set to UF.
+static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
+                                       unsigned UF) {
+  if (!EnableWideActiveLaneMask || !VF.isVector() || UF == 1)
+    return false;
+
+  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
+  VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock();
+  auto *Term = &ExitingVPBB->back();
+
+  using namespace llvm::VPlanPatternMatch;
+  if (!match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask(
+                       m_VPValue(), m_VPValue(), m_VPValue())))))
+    return false;
+
+  auto *Header = cast<VPBasicBlock>(VectorRegion->getEntry());
+  LLVMContext &Ctx = Plan.getContext();
+
+  auto ExtractFromALM = [&](VPInstruction *ALM,
+                            SmallVectorImpl<VPValue *> &Extracts) {
+    DebugLoc DL = ALM->getDebugLoc();
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      SmallVector<VPValue *> Ops;
+      Ops.append({ALM, Plan.getOrAddLiveIn(
+                           ConstantInt::get(IntegerType::getInt64Ty(Ctx),
+                                            VF.getKnownMinValue() * Part))});
+      auto *Ext = new VPWidenIntrinsicRecipe(Intrinsic::vector_extract, Ops,
+                                             IntegerType::getInt1Ty(Ctx), DL);
+      Extracts[Part] = Ext;
+      Ext->insertAfter(ALM);
+    }
+  };
+
+  // Create a list of each active lane mask phi, ordered by unroll part.
+  SmallVector<VPActiveLaneMaskPHIRecipe *> Phis(UF, nullptr);
+  for (VPRecipeBase &R : Header->phis()) {
+    auto *Phi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
+    if (!Phi)
+      continue;
+    VPValue *Index = nullptr;
+    match(Phi->getBackedgeValue(),
+          m_ActiveLaneMask(m_VPValue(Index), m_VPValue(), m_VPValue()));
+    assert(Index && "Expected index from ActiveLaneMask instruction");
+
+    auto *II = dyn_cast<VPInstruction>(Index);
+    if (II && II->getOpcode() == VPInstruction::CanonicalIVIncrementForPart) {
+      auto Part = cast<ConstantInt>(II->getOperand(1)->getLiveInIRValue());
+      Phis[Part->getZExtValue()] = Phi;
+    } else
+      // Anything other than a CanonicalIVIncrementForPart is part 0.
+      Phis[0] = Phi;
+  }
+
+  assert(all_of(Phis, [](VPActiveLaneMaskPHIRecipe *Phi) { return Phi; }) &&
+         "Expected one VPActiveLaneMaskPHIRecipe for each unroll part");
+
+  auto *EntryALM = cast<VPInstruction>(Phis[0]->getStartValue());
+  auto *LoopALM = cast<VPInstruction>(Phis[0]->getBackedgeValue());
+
+  assert((EntryALM->getOpcode() == VPInstruction::ActiveLaneMask &&
+          LoopALM->getOpcode() == VPInstruction::ActiveLaneMask) &&
+         "Expected incoming values of Phi to be ActiveLaneMasks");
+
+  // When using wide lane masks, the return type of the get.active.lane.mask
+  // intrinsic is VF x UF (last operand).
+  VPValue *ALMMultiplier =
+      Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+  EntryALM->setOperand(2, ALMMultiplier);
+  LoopALM->setOperand(2, ALMMultiplier);
+
+  // Create UF x extract vectors and insert into preheader.
+  SmallVector<VPValue *> EntryExtracts(UF);
+  ExtractFromALM(EntryALM, EntryExtracts);
+
+  // Create UF x extract vectors and insert before the loop compare & branch,
+  // updating the compare to use the first extract.
+  SmallVector<VPValue *> LoopExtracts(UF);
+  ExtractFromALM(LoopALM, LoopExtracts);
+  VPInstruction *Not = cast<VPInstruction>(Term->getOperand(0));
+  Not->setOperand(0, LoopExtracts[0]);
+
+  // Update the incoming values of active lane mask phis.
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    Phis[Part]->setStartValue(EntryExtracts[Part]);
+    Phis[Part]->setBackedgeValue(LoopExtracts[Part]);
+  }
+
+  return true;
+}
+
 /// Try to simplify the branch condition of \p Plan.
This may restrict the /// resulting plan to \p BestVF and \p BestUF. static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, @@ -1478,8 +1578,8 @@ static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, VPValue *Cond; ScalarEvolution &SE = *PSE.getSE(); if (match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) || - match(Term, m_BranchOnCond( - m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) { + match(Term, m_BranchOnCond(m_Not(m_ActiveLaneMask( + m_VPValue(), m_VPValue(), m_VPValue()))))) { // Try to simplify the branch condition if TC <= VF * UF when the latch // terminator is BranchOnCount or BranchOnCond where the input is // Not(ActiveLaneMask). @@ -1558,8 +1658,8 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); - bool MadeChange = - simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); + bool MadeChange = tryToReplaceALMWithWideALM(Plan, BestVF, BestUF); + MadeChange |= simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF); if (MadeChange) { @@ -2042,9 +2142,11 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( "index.part.next"); // Create the active lane mask instruction in the VPlan preheader. - auto *EntryALM = - Builder.createNaryOp(VPInstruction::ActiveLaneMask, {EntryIncrement, TC}, - DL, "active.lane.mask.entry"); + VPValue *ALMMultiplier = Plan.getOrAddLiveIn( + ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, + {EntryIncrement, TC, ALMMultiplier}, DL, + "active.lane.mask.entry"); // Now create the ActiveLaneMaskPhi recipe in the main loop using the // preheader ActiveLaneMask instruction. @@ -2059,8 +2161,8 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart, {IncrementValue}, {false, false}, DL); auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask, - {InLoopIncrement, TripCount}, DL, - "active.lane.mask.next"); + {InLoopIncrement, TripCount, ALMMultiplier}, + DL, "active.lane.mask.next"); LaneMaskPhi->addOperand(ALM); // Replace the original terminator with BranchOnCond. 
We have to invert the @@ -2139,9 +2241,12 @@ void VPlanTransforms::addActiveLaneMask( Plan, DataAndControlFlowWithoutRuntimeCheck); } else { VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); - LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, - {WideCanonicalIV, Plan.getTripCount()}, nullptr, - "active.lane.mask"); + VPValue *ALMMultiplier = Plan.getOrAddLiveIn( + ConstantInt::get(Plan.getCanonicalIV()->getScalarType(), 1)); + LaneMask = + B.createNaryOp(VPInstruction::ActiveLaneMask, + {WideCanonicalIV, Plan.getTripCount(), ALMMultiplier}, + nullptr, "active.lane.mask"); } // Walk users of WideCanonicalIV and replace the header mask of the form diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index 700a733bf9f2c..c6c1ef3369825 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -65,7 +65,7 @@ bool vputils::isHeaderMask(const VPValue *V, VPlan &Plan) { VPValue *A, *B; using namespace VPlanPatternMatch; - if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B)))) + if (match(V, m_ActiveLaneMask(m_VPValue(A), m_VPValue(B), m_SpecificInt(1)))) return B == Plan.getTripCount() && (match(A, m_ScalarIVSteps(m_Specific(Plan.getCanonicalIV()), m_SpecificInt(1), diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll new file mode 100644 index 0000000000000..8dd9dba1758ab --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-wide-lane-mask.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --version 4 +; RUN: opt -S -passes=loop-vectorize -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \ +; RUN: -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck %s -check-prefix CHECK-UF1 +; RUN: opt -S --passes=loop-vectorize -scalable-vectorization=off -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask \ +; RUN: -force-vector-width=4 -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4 + +target triple = "aarch64-unknown-linux" + +define void @fixed_wide_active_lane_mask(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #0 { +; CHECK-UF1-LABEL: define void @fixed_wide_active_lane_mask( +; CHECK-UF1-SAME: ptr noalias [[DST:%.*]], ptr noalias readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-UF1-NEXT: entry: +; CHECK-UF1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK-UF1: vector.ph: +; CHECK-UF1-NEXT: [[TMP0:%.*]] = sub i64 [[N]], 4 +; CHECK-UF1-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[N]], 4 +; CHECK-UF1-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 0 +; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 0, i64 [[N]]) +; CHECK-UF1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UF1: vector.body: +; CHECK-UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF1-NEXT: [[TMP3:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-UF1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP3]], i64 0 +; CHECK-UF1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 
x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-UF1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF1-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[INDEX]], i64 [[TMP2]]) +; CHECK-UF1-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-UF1-NEXT: [[TMP6:%.*]] = xor i1 [[TMP5]], true +; CHECK-UF1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF1: middle.block: +; +; CHECK-UF4-LABEL: define void @fixed_wide_active_lane_mask( +; CHECK-UF4-SAME: ptr noalias [[DST:%.*]], ptr noalias readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-UF4-NEXT: entry: +; CHECK-UF4-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK-UF4: vector.ph: +; CHECK-UF4-NEXT: [[TMP0:%.*]] = sub i64 [[N]], 16 +; CHECK-UF4-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], 16 +; CHECK-UF4-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP0]], i64 0 +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 0, i64 [[N]]) +; CHECK-UF4-NEXT: [[TMP4:%.*]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 12) +; CHECK-UF4-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 8) +; CHECK-UF4-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 4) +; CHECK-UF4-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 4, i64 [[N]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY2:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 8, i64 [[N]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 12, i64 [[N]]) +; CHECK-UF4-NEXT: br label [[VECTOR_BODY1:%.*]] +; CHECK-UF4: vector.body: +; CHECK-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <4 x i1> [ [[TMP1]], [[ENTRY]] ], [ [[TMP9:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK4:%.*]] = phi <4 x i1> [ [[TMP2]], [[ENTRY]] ], [ [[TMP10:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK5:%.*]] = phi <4 x i1> [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi <4 x i1> [ [[TMP4]], [[ENTRY]] ], [ [[TMP12:%.*]], [[VECTOR_BODY1]] ] +; CHECK-UF4-NEXT: [[TMP7:%.*]] = load i32, ptr [[SRC]], align 4 +; CHECK-UF4-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TMP7]], i64 0 +; CHECK-UF4-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-UF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF4-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 4 +; CHECK-UF4-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 8 +; CHECK-UF4-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr 
[[TMP8]], i32 12 +; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP8]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP17]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK4]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP18]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK5]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP19]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK6]]) +; CHECK-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 +; CHECK-UF4-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 4 +; CHECK-UF4-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 8 +; CHECK-UF4-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], 12 +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[TMP6]]) +; CHECK-UF4-NEXT: [[TMP12]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 12) +; CHECK-UF4-NEXT: [[TMP11]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 8) +; CHECK-UF4-NEXT: [[TMP10]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 4) +; CHECK-UF4-NEXT: [[TMP9]] = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT7:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP13]], i64 [[TMP6]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT8:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP14]], i64 [[TMP6]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT9:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 [[TMP15]], i64 [[TMP6]]) +; CHECK-UF4-NEXT: [[TMP21:%.*]] = extractelement <4 x i1> [[TMP9]], i32 0 +; CHECK-UF4-NEXT: [[TMP20:%.*]] = xor i1 [[TMP21]], true +; CHECK-UF4-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY1]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF4: middle.block: +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %ld = load i32, ptr %src + %arrayidx = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 %ld, ptr %arrayidx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +attributes #0 = { nounwind "target-features"="+neon,+sve" } + +;. +; CHECK-UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. +; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +;. 
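The CHECK-UF4 output above boils down to one pattern: a single lane mask covering VF * UF lanes, split into per-part VF-wide masks with llvm.vector.extract at offsets that are multiples of VF. A minimal standalone sketch of that shape for VF=4, UF=4 follows; the function and value names are illustrative only, not taken from the generated code.

declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64, i64)
declare <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1>, i64)

define <4 x i1> @wide_mask_sketch(i64 %index, i64 %tc) {
  ; One 16-lane mask replaces four separate 4-lane masks.
  %wide.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 %index, i64 %tc)
  ; Each unroll part takes its VF lanes, starting at a multiple of VF.
  %part0 = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> %wide.mask, i64 0)
  %part1 = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> %wide.mask, i64 4)
  %part2 = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> %wide.mask, i64 8)
  %part3 = call <4 x i1> @llvm.vector.extract.v4i1.v16i1(<16 x i1> %wide.mask, i64 12)
  ret <4 x i1> %part0
}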
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll new file mode 100644 index 0000000000000..5d318146f48ad --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-wide-lane-mask.ll @@ -0,0 +1,304 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^middle.block:" --version 4 +; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=1 < %s | FileCheck %s -check-prefix CHECK-UF1 +; RUN: opt -S --passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -force-vector-interleave=4 < %s | FileCheck %s -check-prefix CHECK-UF4 + +target triple = "aarch64-unknown-linux" + +define void @scalable_wide_active_lane_mask(ptr noalias %dst, ptr readonly %src, i64 %n) #0 { +; CHECK-UF1-LABEL: define void @scalable_wide_active_lane_mask( +; CHECK-UF1-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-UF1-NEXT: entry: +; CHECK-UF1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-UF1: vector.ph: +; CHECK-UF1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF1-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP5]], 16 +; CHECK-UF1-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF1-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP17]], 4 +; CHECK-UF1-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP18]] +; CHECK-UF1-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP18]] +; CHECK-UF1-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) +; CHECK-UF1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UF1: vector.body: +; CHECK-UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH1]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-UF1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-UF1-NEXT: [[TMP6:%.*]] = mul [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-UF1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF1-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP6]], ptr [[TMP13]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-UF1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP12]] +; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-UF1-NEXT: [[TMP14:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-UF1-NEXT: [[TMP11:%.*]] = xor i1 [[TMP14]], true +; CHECK-UF1-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF1: middle.block: +; +; CHECK-UF4-LABEL: define void @scalable_wide_active_lane_mask( +; CHECK-UF4-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-UF4-NEXT: entry: +; CHECK-UF4-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH1:%.*]] +; CHECK-UF4: vector.ph: +; CHECK-UF4-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP62:%.*]] = mul nuw i64 [[TMP61]], 64 +; 
CHECK-UF4-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 6 +; CHECK-UF4-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP3]] +; CHECK-UF4-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP3]] +; CHECK-UF4-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 +; CHECK-UF4-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 4 +; CHECK-UF4-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP11]] +; CHECK-UF4-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 5 +; CHECK-UF4-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP13]] +; CHECK-UF4-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 48 +; CHECK-UF4-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP15]] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 0, i64 [[N]]) +; CHECK-UF4-NEXT: [[TMP19:%.*]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 48) +; CHECK-UF4-NEXT: [[TMP18:%.*]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 32) +; CHECK-UF4-NEXT: [[TMP17:%.*]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 16) +; CHECK-UF4-NEXT: [[TMP16:%.*]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) +; CHECK-UF4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UF4: vector.body: +; CHECK-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[TMP16]], [[VECTOR_PH1]] ], [ [[TMP55:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[TMP17]], [[VECTOR_PH1]] ], [ [[TMP56:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[TMP18]], [[VECTOR_PH1]] ], [ [[TMP57:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[TMP19]], [[VECTOR_PH1]] ], [ [[TMP58:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-UF4-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP23:%.*]] = shl nuw i64 [[TMP22]], 4 +; CHECK-UF4-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i64 [[TMP23]] +; CHECK-UF4-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP32:%.*]] = shl nuw i64 [[TMP31]], 5 +; CHECK-UF4-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i64 [[TMP32]] +; CHECK-UF4-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP29:%.*]] = mul nuw i64 [[TMP34]], 48 +; CHECK-UF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i64 [[TMP29]] +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP20]], i32 1, [[ACTIVE_LANE_MASK]], poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP24]], i32 1, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call 
@llvm.masked.load.nxv16i8.p0(ptr [[TMP33]], i32 1, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[TMP30]], i32 1, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-UF4-NEXT: [[TMP25:%.*]] = mul [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-UF4-NEXT: [[TMP26:%.*]] = mul [[WIDE_MASKED_LOAD9]], splat (i8 3) +; CHECK-UF4-NEXT: [[TMP27:%.*]] = mul [[WIDE_MASKED_LOAD10]], splat (i8 3) +; CHECK-UF4-NEXT: [[TMP28:%.*]] = mul [[WIDE_MASKED_LOAD11]], splat (i8 3) +; CHECK-UF4-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF4-NEXT: [[TMP37:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP38:%.*]] = shl nuw i64 [[TMP37]], 4 +; CHECK-UF4-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[TMP38]] +; CHECK-UF4-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP41:%.*]] = shl nuw i64 [[TMP40]], 5 +; CHECK-UF4-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[TMP41]] +; CHECK-UF4-NEXT: [[TMP43:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP44:%.*]] = mul nuw i64 [[TMP43]], 48 +; CHECK-UF4-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[TMP35]], i64 [[TMP44]] +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP25]], ptr [[TMP35]], i32 1, [[ACTIVE_LANE_MASK]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP26]], ptr [[TMP39]], i32 1, [[ACTIVE_LANE_MASK6]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP27]], ptr [[TMP42]], i32 1, [[ACTIVE_LANE_MASK7]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP28]], ptr [[TMP45]], i32 1, [[ACTIVE_LANE_MASK8]]) +; CHECK-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP62]] +; CHECK-UF4-NEXT: [[TMP46:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP47:%.*]] = shl nuw i64 [[TMP46]], 4 +; CHECK-UF4-NEXT: [[TMP48:%.*]] = add i64 [[INDEX]], [[TMP47]] +; CHECK-UF4-NEXT: [[TMP49:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP50:%.*]] = shl nuw i64 [[TMP49]], 5 +; CHECK-UF4-NEXT: [[TMP51:%.*]] = add i64 [[INDEX]], [[TMP50]] +; CHECK-UF4-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP53:%.*]] = mul nuw i64 [[TMP52]], 48 +; CHECK-UF4-NEXT: [[TMP54:%.*]] = add i64 [[INDEX]], [[TMP53]] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv64i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-UF4-NEXT: [[TMP58]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 48) +; CHECK-UF4-NEXT: [[TMP57]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 32) +; CHECK-UF4-NEXT: [[TMP56]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 16) +; CHECK-UF4-NEXT: [[TMP55]] = call @llvm.vector.extract.nxv16i1.nxv64i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT12:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP48]], i64 [[TMP9]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT13:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP51]], i64 [[TMP9]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT14:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP54]], i64 [[TMP9]]) +; CHECK-UF4-NEXT: [[TMP59:%.*]] = extractelement [[TMP55]], i32 0 +; CHECK-UF4-NEXT: [[TMP60:%.*]] = xor i1 [[TMP59]], true +; CHECK-UF4-NEXT: br i1 [[TMP60]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-UF4: middle.block: +; 
+entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx1 = getelementptr inbounds i8, ptr %src, i64 %iv + %ld = load i8, ptr %arrayidx1 + %mul = mul i8 %ld, 3 + %arrayidx2 = getelementptr inbounds i8, ptr %dst, i64 %iv + store i8 %mul, ptr %arrayidx2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +define void @scalable_wide_active_lane_mask_double(ptr noalias %dst, ptr readonly %src, i64 %n) #0 { +; CHECK-UF1-LABEL: define void @scalable_wide_active_lane_mask_double( +; CHECK-UF1-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-UF1-NEXT: entry: +; CHECK-UF1-NEXT: [[CMP6:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-UF1-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-UF1: for.body.preheader: +; CHECK-UF1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UF1: vector.ph: +; CHECK-UF1-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF1-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP12]], 2 +; CHECK-UF1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF1-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP2]], 1 +; CHECK-UF1-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[TMP9]] +; CHECK-UF1-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[N]], [[TMP9]] +; CHECK-UF1-NEXT: [[TMP13:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0 +; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) +; CHECK-UF1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UF1: vector.body: +; CHECK-UF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF1-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-UF1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-UF1-NEXT: [[TMP3:%.*]] = fmul [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00) +; CHECK-UF1-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF1-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP3]], ptr [[TMP8]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-UF1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] +; CHECK-UF1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP13]]) +; CHECK-UF1-NEXT: [[TMP7:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-UF1-NEXT: [[TMP6:%.*]] = xor i1 [[TMP7]], true +; CHECK-UF1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-UF1: middle.block: +; +; CHECK-UF4-LABEL: define void @scalable_wide_active_lane_mask_double( +; CHECK-UF4-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-UF4-NEXT: entry: +; CHECK-UF4-NEXT: [[CMP6:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-UF4-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-UF4: for.body.preheader: +; CHECK-UF4-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-UF4: vector.ph: +; CHECK-UF4-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8 +; 
CHECK-UF4-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP26:%.*]] = shl nuw i64 [[TMP4]], 3 +; CHECK-UF4-NEXT: [[TMP31:%.*]] = sub i64 [[N]], [[TMP26]] +; CHECK-UF4-NEXT: [[TMP56:%.*]] = icmp ugt i64 [[N]], [[TMP26]] +; CHECK-UF4-NEXT: [[WIDE_TRIP_COUNT:%.*]] = select i1 [[TMP56]], i64 [[TMP31]], i64 0 +; CHECK-UF4-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1 +; CHECK-UF4-NEXT: [[INDEX_PART_NEXT:%.*]] = add i64 0, [[TMP6]] +; CHECK-UF4-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 2 +; CHECK-UF4-NEXT: [[INDEX_PART_NEXT1:%.*]] = add i64 0, [[TMP8]] +; CHECK-UF4-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 6 +; CHECK-UF4-NEXT: [[INDEX_PART_NEXT2:%.*]] = add i64 0, [[TMP10]] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]]) +; CHECK-UF4-NEXT: [[TMP14:%.*]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 6) +; CHECK-UF4-NEXT: [[TMP13:%.*]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 4) +; CHECK-UF4-NEXT: [[TMP12:%.*]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 2) +; CHECK-UF4-NEXT: [[TMP11:%.*]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_ENTRY]], i64 0) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY3:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_PART_NEXT]], i64 [[N]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY4:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_PART_NEXT1]], i64 [[N]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_ENTRY5:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_PART_NEXT2]], i64 [[N]]) +; CHECK-UF4-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-UF4: vector.body: +; CHECK-UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[TMP11]], [[VECTOR_PH]] ], [ [[TMP50:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK6:%.*]] = phi [ [[TMP12]], [[VECTOR_PH]] ], [ [[TMP51:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK7:%.*]] = phi [ [[TMP13]], [[VECTOR_PH]] ], [ [[TMP52:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK8:%.*]] = phi [ [[TMP14]], [[VECTOR_PH]] ], [ [[TMP53:%.*]], [[VECTOR_BODY]] ] +; CHECK-UF4-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-UF4-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP28:%.*]] = shl nuw i64 [[TMP27]], 1 +; CHECK-UF4-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP28]] +; CHECK-UF4-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP21:%.*]] = shl nuw i64 [[TMP20]], 2 +; CHECK-UF4-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP21]] +; CHECK-UF4-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP24:%.*]] = mul nuw i64 [[TMP23]], 6 +; CHECK-UF4-NEXT: [[TMP25:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP24]] +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP15]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP29]], i32 8, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call 
@llvm.masked.load.nxv2f64.p0(ptr [[TMP22]], i32 8, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-UF4-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP25]], i32 8, [[ACTIVE_LANE_MASK8]], poison) +; CHECK-UF4-NEXT: [[TMP16:%.*]] = fmul [[WIDE_MASKED_LOAD]], splat (double 3.000000e+00) +; CHECK-UF4-NEXT: [[TMP17:%.*]] = fmul [[WIDE_MASKED_LOAD9]], splat (double 3.000000e+00) +; CHECK-UF4-NEXT: [[TMP18:%.*]] = fmul [[WIDE_MASKED_LOAD10]], splat (double 3.000000e+00) +; CHECK-UF4-NEXT: [[TMP19:%.*]] = fmul [[WIDE_MASKED_LOAD11]], splat (double 3.000000e+00) +; CHECK-UF4-NEXT: [[TMP30:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-UF4-NEXT: [[TMP32:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP33:%.*]] = shl nuw i64 [[TMP32]], 1 +; CHECK-UF4-NEXT: [[TMP34:%.*]] = getelementptr inbounds double, ptr [[TMP30]], i64 [[TMP33]] +; CHECK-UF4-NEXT: [[TMP35:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP36:%.*]] = shl nuw i64 [[TMP35]], 2 +; CHECK-UF4-NEXT: [[TMP37:%.*]] = getelementptr inbounds double, ptr [[TMP30]], i64 [[TMP36]] +; CHECK-UF4-NEXT: [[TMP38:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP39:%.*]] = mul nuw i64 [[TMP38]], 6 +; CHECK-UF4-NEXT: [[TMP40:%.*]] = getelementptr inbounds double, ptr [[TMP30]], i64 [[TMP39]] +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP16]], ptr [[TMP30]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP17]], ptr [[TMP34]], i32 8, [[ACTIVE_LANE_MASK6]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP18]], ptr [[TMP37]], i32 8, [[ACTIVE_LANE_MASK7]]) +; CHECK-UF4-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP19]], ptr [[TMP40]], i32 8, [[ACTIVE_LANE_MASK8]]) +; CHECK-UF4-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]] +; CHECK-UF4-NEXT: [[TMP41:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP42:%.*]] = shl nuw i64 [[TMP41]], 1 +; CHECK-UF4-NEXT: [[TMP43:%.*]] = add i64 [[INDEX]], [[TMP42]] +; CHECK-UF4-NEXT: [[TMP44:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP45:%.*]] = shl nuw i64 [[TMP44]], 2 +; CHECK-UF4-NEXT: [[TMP46:%.*]] = add i64 [[INDEX]], [[TMP45]] +; CHECK-UF4-NEXT: [[TMP47:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-UF4-NEXT: [[TMP48:%.*]] = mul nuw i64 [[TMP47]], 6 +; CHECK-UF4-NEXT: [[TMP49:%.*]] = add i64 [[INDEX]], [[TMP48]] +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-UF4-NEXT: [[TMP53]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 6) +; CHECK-UF4-NEXT: [[TMP52]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 4) +; CHECK-UF4-NEXT: [[TMP51]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 2) +; CHECK-UF4-NEXT: [[TMP50]] = call @llvm.vector.extract.nxv2i1.nxv8i1( [[ACTIVE_LANE_MASK_NEXT]], i64 0) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT12:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP43]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT13:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP46]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-UF4-NEXT: [[ACTIVE_LANE_MASK_NEXT14:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP49]], i64 [[WIDE_TRIP_COUNT]]) +; CHECK-UF4-NEXT: [[TMP54:%.*]] = extractelement [[TMP50]], i32 0 +; CHECK-UF4-NEXT: [[TMP55:%.*]] = xor i1 [[TMP54]], true +; CHECK-UF4-NEXT: br i1 [[TMP55]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-UF4: middle.block: +; +entry: + %cmp6 = icmp sgt i64 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx1 = getelementptr inbounds double, ptr %src, i64 %iv + %ld = load double, ptr %arrayidx1 + %mul = fmul double %ld, 3.000000e+00 + %arrayidx2 = getelementptr inbounds double, ptr %dst, i64 %iv + store double %mul, ptr %arrayidx2 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +attributes #0 = { nounwind vscale_range(1,16) "target-features"="+sve2p1" } + +;. +; CHECK-UF1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-UF1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-UF1: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-UF1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +;. +; CHECK-UF4: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK-UF4: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-UF4: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-UF4: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} +;. diff --git a/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll new file mode 100644 index 0000000000000..62ea3ead3ef7f --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/active-lane-mask.ll @@ -0,0 +1,85 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilogue=predicate-dont-vectorize -enable-wide-lane-mask -S | FileCheck %s + +target triple = "thumbv8.1m.main-arm-unknown-eabihf" + +define void @f0(ptr noalias %dst, ptr readonly %src, i64 %n) #0 { +; CHECK-LABEL: define void @f0( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[VAL:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[VAL]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_END:.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], 31 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 32 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 16 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[INDEX]], i64 [[N]]) +; CHECK-NEXT: [[ACTIVE_LANE_MASK1:%.*]] = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 [[TMP0]], i64 [[N]]) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 16 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP1]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]], <16 x i8> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr [[TMP3]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK1]], <16 x i8> poison) +; CHECK-NEXT: 
[[TMP4:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 3) +; CHECK-NEXT: [[TMP5:%.*]] = mul <16 x i8> [[WIDE_MASKED_LOAD2]], splat (i8 3) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i32 16 +; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP4]], ptr [[TMP6]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.v16i8.p0(<16 x i8> [[TMP5]], ptr [[TMP8]], i32 1, <16 x i1> [[ACTIVE_LANE_MASK1]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[FOR_END_LOOPEXIT:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ 0, %[[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[MUL:%.*]] = mul i8 [[TMP10]], 3 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i8 [[MUL]], ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_END_LOOPEXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[FOR_END_LOOPEXIT]]: +; CHECK-NEXT: br label %[[FOR_END]] +; CHECK: [[FOR_END]]: +; CHECK-NEXT: ret void +; +entry: + %val = icmp sgt i64 %n, 0 + br i1 %val, label %for.body, label %for.end + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %src, i64 %indvars.iv + %0 = load i8, ptr %arrayidx, align 1 + %mul = mul i8 %0, 3 + %arrayidx3 = getelementptr inbounds i8, ptr %dst, i64 %indvars.iv + store i8 %mul, ptr %arrayidx3, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %n + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1 + +for.end: + ret void +} + +attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" } + +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 16} +!3 = !{!"llvm.loop.interleave.count", i32 2} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} +; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} +;.
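For reference alongside the scalable test (sve-wide-lane-mask.ll) earlier, the same shape with scalable types: the wide mask is <vscale x 64 x i1> and each part is a <vscale x 16 x i1> extracted at a multiple of the minimum VF (16), matching the extract indices 0/16/32/48 in that test's CHECK lines. A minimal sketch with illustrative names only:

declare <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64, i64)
declare <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1>, i64)

define <vscale x 16 x i1> @wide_mask_sketch_scalable(i64 %index, i64 %tc) {
  ; One nxv64i1 mask replaces four separate nxv16i1 masks (VF = vscale x 16, UF = 4).
  %wide.mask = call <vscale x 64 x i1> @llvm.get.active.lane.mask.nxv64i1.i64(i64 %index, i64 %tc)
  ; vector.extract indices must be multiples of the subvector's minimum element count.
  %part0 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> %wide.mask, i64 0)
  %part1 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> %wide.mask, i64 16)
  %part2 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> %wide.mask, i64 32)
  %part3 = call <vscale x 16 x i1> @llvm.vector.extract.nxv16i1.nxv64i1(<vscale x 64 x i1> %wide.mask, i64 48)
  ret <vscale x 16 x i1> %part0
}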