diff --git a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
index c85ef3e131068..cb8a771ad8419 100644
--- a/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -491,11 +491,12 @@ struct PointerDiffInfo {
   const SCEV *SinkStart;
   unsigned AccessSize;
   bool NeedsFreeze;
+  bool WriteAfterRead;
 
   PointerDiffInfo(const SCEV *SrcStart, const SCEV *SinkStart,
-                  unsigned AccessSize, bool NeedsFreeze)
+                  unsigned AccessSize, bool NeedsFreeze, bool WriteAfterRead)
       : SrcStart(SrcStart), SinkStart(SinkStart), AccessSize(AccessSize),
-        NeedsFreeze(NeedsFreeze) {}
+        NeedsFreeze(NeedsFreeze), WriteAfterRead(WriteAfterRead) {}
 };
 
 /// Holds information about the memory runtime legality checks to verify
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 0f17312b03827..760bdb175d942 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -205,6 +205,13 @@ enum class TailFoldingStyle {
   DataWithEVL,
 };
 
+enum class RTCheckStyle {
+  /// Create runtime checks based on the difference between two pointers.
+  ScalarDifference,
+  /// Form a mask of the elements that are free of WAR or RAW hazards.
+  UseSafeEltsMask,
+};
+
 struct TailFoldingInfo {
   TargetLibraryInfo *TLI;
   LoopVectorizationLegality *LVL;
@@ -1357,6 +1364,11 @@ class TargetTransformInfo {
       PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
       TTI::TargetCostKind CostKind) const;
 
+  /// \return true if a mask should be formed that disables lanes that could
+  /// alias between two pointers. The mask is created by the
+  /// loop_dependence_{war,raw}_mask intrinsics.
+  LLVM_ABI bool useSafeEltsMask() const;
+
   /// \return The maximum interleave factor that any transform should try to
   /// perform for this target. This number depends on the level of parallelism
   /// and the number of execution units in the CPU.
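For context on the new check style: `ScalarDifference` keeps the existing behaviour (compare the pointer distance against `VF * IC * AccessSize` and fall back to the scalar loop on failure), while `UseSafeEltsMask` asks the target for a per-lane predicate so the vector loop can still run on the lanes that are provably hazard-free. Below is a rough scalar model of the write-after-read form only; the function name, the flooring division, and the all-true fast path are assumptions made here for illustration, the authoritative semantics are the LangRef entries for `llvm.loop.dependence.war.mask`/`raw.mask`, and the read-after-write form differs in which access is the earlier one.

```cpp
#include <cstdint>
#include <vector>

// Illustrative model only: lane I of a write-after-read mask is enabled when
// the store issued for lane I cannot clobber an element that a later lane of
// the same vector iteration still has to read. AccessSize is in bytes.
std::vector<bool> safeEltsMaskWAR(uintptr_t ReadPtr, uintptr_t WritePtr,
                                  uint64_t AccessSize, unsigned VF) {
  std::vector<bool> Mask(VF, false);
  // Writes at or before the reads can never overwrite a not-yet-read element
  // of the current vector iteration, so every lane is safe.
  if (WritePtr <= ReadPtr) {
    Mask.assign(VF, true);
    return Mask;
  }
  // Otherwise only the first (WritePtr - ReadPtr) / AccessSize lanes are safe.
  uint64_t SafeLanes = (WritePtr - ReadPtr) / AccessSize;
  for (unsigned I = 0; I < VF; ++I)
    Mask[I] = I < SafeLanes;
  return Mask;
}
```

With byte accesses and a write pointer three bytes past the read pointer, for example, only the first three lanes would be enabled, which is exactly the count the popcount-based IV step introduced later in this patch consumes.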
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index aacb88d2f9684..97021bea3f90a 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -659,6 +659,8 @@ class TargetTransformInfoImplBase { return InstructionCost::getInvalid(); } + virtual bool useSafeEltsMask() const { return false; } + virtual unsigned getMaxInterleaveFactor(ElementCount VF) const { return 1; } virtual InstructionCost getArithmeticInstrCost( diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 5d88e5f54e3d6..eb15fe3dc549e 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -511,11 +511,14 @@ bool RuntimePointerChecking::tryToCreateDiffCheck( } } + bool WriteAfterRead = !Src->IsWritePtr && Sink->IsWritePtr; + LLVM_DEBUG(dbgs() << "LAA: Creating diff runtime check for:\n" << "SrcStart: " << *SrcStartInt << '\n' << "SinkStartInt: " << *SinkStartInt << '\n'); DiffChecks.emplace_back(SrcStartInt, SinkStartInt, AllocSize, - Src->NeedsFreeze || Sink->NeedsFreeze); + Src->NeedsFreeze || Sink->NeedsFreeze, + WriteAfterRead); return true; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 0426ac7e62fab..e82b4c21ca3ed 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -878,6 +878,10 @@ InstructionCost TargetTransformInfo::getPartialReductionCost( BinOp, CostKind); } +bool TargetTransformInfo::useSafeEltsMask() const { + return TTIImpl->useSafeEltsMask(); +} + unsigned TargetTransformInfo::getMaxInterleaveFactor(ElementCount VF) const { return TTIImpl->getMaxInterleaveFactor(VF); } diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 977ed59e09243..d08655c756cc6 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -175,6 +175,9 @@ bool llvm::isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, return (ScalarOpdIdx == 2); case Intrinsic::experimental_vp_splice: return ScalarOpdIdx == 2 || ScalarOpdIdx == 4; + case Intrinsic::loop_dependence_war_mask: + case Intrinsic::loop_dependence_raw_mask: + return true; default: return false; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index c4bd3c7803c1a..d9d8a419414ad 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -5880,6 +5880,11 @@ InstructionCost AArch64TTIImpl::getPartialReductionCost( return Cost + 2; } +bool AArch64TTIImpl::useSafeEltsMask() const { + // The whilewr/rw instructions require SVE2 + return ST->hasSVE2() || ST->hasSME(); +} + InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef Mask, diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index e62fdb6786843..74689fcfe53c0 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -406,6 +406,8 @@ class AArch64TTIImpl final : public BasicTTIImplBase { TTI::PartialReductionExtendKind OpBExtend, std::optional BinOp, TTI::TargetCostKind CostKind) const override; + bool useSafeEltsMask() const override; + bool enableOrderedReductions() const 
override { return true; } InstructionCost getInterleavedMemoryOpCost( diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 6e60b94be78e3..78a60818ab966 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -2149,7 +2149,7 @@ Value *llvm::addDiffRuntimeChecks( // Map to keep track of created compares, The key is the pair of operands for // the compare, to allow detecting and re-using redundant compares. DenseMap, Value *> SeenCompares; - for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze] : Checks) { + for (const auto &[SrcStart, SinkStart, AccessSize, NeedsFreeze, _] : Checks) { Type *Ty = SinkStart->getType(); // Compute VF * IC * AccessSize. auto *VFTimesICTimesSize = diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 345bc63081b81..912214aa784d1 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -169,6 +169,7 @@ STATISTIC(LoopsVectorized, "Number of loops vectorized"); STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized"); +STATISTIC(LoopsAliasMasked, "Number of loops predicated with an alias mask"); static cl::opt EnableEpilogueVectorization( "enable-epilogue-vectorization", cl::init(true), cl::Hidden, @@ -1333,6 +1334,12 @@ class LoopVectorizationCostModel { : ChosenTailFoldingStyle->second; } + RTCheckStyle getRTCheckStyle(const TargetTransformInfo &TTI) const { + if (TTI.useSafeEltsMask()) + return RTCheckStyle::UseSafeEltsMask; + return RTCheckStyle::ScalarDifference; + } + /// Selects and saves TailFoldingStyle for 2 options - if IV update may /// overflow or not. /// \param IsScalableVF true if scalable vector factors enabled. @@ -8554,6 +8561,7 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( bool ForControlFlow = useActiveLaneMaskForControlFlow(Style); bool WithoutRuntimeCheck = Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck; + VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow, WithoutRuntimeCheck); } @@ -8974,11 +8982,104 @@ void LoopVectorizationPlanner::attachRuntimeChecks( assert((!CM.OptForSize || CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) && "Cannot SCEV check stride or overflow when optimizing for size"); - VPlanTransforms::attachCheckBlock(Plan, SCEVCheckCond, SCEVCheckBlock, + VPlanTransforms::attachCheckBlock(Plan, Plan.getOrAddLiveIn(SCEVCheckCond), + Plan.createVPIRBasicBlock(SCEVCheckBlock), HasBranchWeights); } const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks(); if (MemCheckBlock && MemCheckBlock->hasNPredecessors(0)) { + VPValue *MemCheckCondVPV = Plan.getOrAddLiveIn(MemCheckCond); + VPBasicBlock *MemCheckBlockVP = Plan.createVPIRBasicBlock(MemCheckBlock); + std::optional> ChecksOpt = + CM.Legal->getRuntimePointerChecking()->getDiffChecks(); + + // Create a mask enabling safe elements for each iteration. 
+    if (CM.getRTCheckStyle(TTI) == RTCheckStyle::UseSafeEltsMask &&
+        ChecksOpt.has_value() && ChecksOpt->size() > 0) {
+      ArrayRef<PointerDiffInfo> Checks = *ChecksOpt;
+      VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+      VPBasicBlock *LoopBody = LoopRegion->getEntryBasicBlock();
+      VPBuilder Builder(MemCheckBlockVP);
+
+      // Create a mask for each possibly-aliasing pointer pair, ANDing them if
+      // there's more than one pair.
+      VPValue *AliasMask = nullptr;
+      for (PointerDiffInfo Check : Checks) {
+        VPValue *Sink =
+            vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SinkStart);
+        VPValue *Src =
+            vputils::getOrCreateVPValueForSCEVExpr(Plan, Check.SrcStart);
+
+        Type *PtrType = PointerType::getUnqual(Plan.getContext());
+        Sink = Builder.createScalarCast(Instruction::CastOps::IntToPtr, Sink,
+                                        PtrType, DebugLoc());
+        Src = Builder.createScalarCast(Instruction::CastOps::IntToPtr, Src,
+                                       PtrType, DebugLoc());
+
+        SmallVector<VPValue *> Ops{
+            Src, Sink,
+            Plan.getConstantInt(IntegerType::getInt64Ty(Plan.getContext()),
+                                Check.AccessSize)};
+        VPWidenIntrinsicRecipe *M = new VPWidenIntrinsicRecipe(
+            Check.WriteAfterRead ? Intrinsic::loop_dependence_war_mask
+                                 : Intrinsic::loop_dependence_raw_mask,
+            Ops, IntegerType::getInt1Ty(Plan.getContext()));
+        MemCheckBlockVP->appendRecipe(M);
+        if (AliasMask)
+          AliasMask = Builder.createAnd(AliasMask, M);
+        else
+          AliasMask = M;
+      }
+      assert(AliasMask && "Expected an alias mask to have been created");
+
+      // Replace uses of the loop body's active lane mask phi with an AND of
+      // the phi and the alias mask.
+      for (VPRecipeBase &R : *LoopBody) {
+        auto *MaskPhi = dyn_cast<VPActiveLaneMaskPHIRecipe>(&R);
+        if (!MaskPhi)
+          continue;
+        VPInstruction *And = new VPInstruction(Instruction::BinaryOps::And,
+                                               {MaskPhi, AliasMask});
+        MaskPhi->replaceUsesWithIf(And, [And](VPUser &U, unsigned) {
+          auto *UR = dyn_cast<VPRecipeBase>(&U);
+          // If this is the first user, insert the AND before it.
+          if (UR && !And->getParent())
+            And->insertBefore(UR);
+          bool Replace = UR != And;
+          return Replace;
+        });
+      }
+
+      // An empty mask would cause an infinite loop since the induction
+      // variable is updated with the number of set elements in the mask. Make
+      // sure we don't execute the vector loop when the mask is empty.
+      VPInstruction *PopCount =
+          new VPInstruction(VPInstruction::PopCount, {AliasMask});
+      PopCount->insertAfter(AliasMask->getDefiningRecipe());
+      VPValue *Cmp =
+          Builder.createICmp(CmpInst::Predicate::ICMP_EQ, PopCount,
+                             Plan.getOrAddLiveIn(ConstantInt::get(
+                                 IntegerType::get(Plan.getContext(), 64), 0)));
+      MemCheckCondVPV = Cmp;
+
+      // Update the IV by the number of active lanes in the mask.
+      auto *CanonicalIVPHI = LoopRegion->getCanonicalIV();
+      auto *CanonicalIVIncrement =
+          cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
+
+      // Increment the phi by the correct amount.
+      VPValue *IncrementBy = PopCount;
+      Type *IVType = CanonicalIVPHI->getScalarType();
+
+      if (IVType->getScalarSizeInBits() < 64) {
+        Builder.setInsertPoint(CanonicalIVIncrement);
+        IncrementBy =
+            Builder.createScalarCast(Instruction::Trunc, IncrementBy, IVType,
+                                     CanonicalIVIncrement->getDebugLoc());
+      }
+      CanonicalIVIncrement->setOperand(1, IncrementBy);
+    }
+
     // VPlan-native path does not do any analysis for runtime checks
     // currently.
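To make the control flow this block builds easier to follow, here is a hand-written scalar walk-through for the shape of the first test below (`c[i] = a[i] + b[i]`, with only `b`/`c` possibly aliasing). It is a sketch, not the VPlan output: the function and variable names, the fixed `VF`, and the byte element type are assumptions, and the safe-lane computation reuses the simplified WAR rule sketched earlier.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>

// Hand-written sketch of the generated loop shape; not the VPlan output.
void maskedByteAdd(uint8_t *A, uint8_t *B, uint8_t *C, size_t N) {
  constexpr size_t VF = 16; // stands in for the scalable vector width
  // vector.memcheck: build the alias mask once. With byte accesses the safe
  // lanes form a prefix whose length is the byte distance from B to C.
  size_t SafeLanes =
      (uintptr_t)C <= (uintptr_t)B
          ? VF
          : std::min<size_t>(VF, (uintptr_t)C - (uintptr_t)B);
  // popcount(alias mask) == 0 would make the vector loop spin forever, so
  // branch to the scalar loop instead (the ICMP_EQ guard in the block above).
  if (SafeLanes == 0) {
    for (size_t I = 0; I < N; ++I)
      C[I] = A[I] + B[I];
    return;
  }
  // vector.body: the body is predicated on (active lane mask & alias mask),
  // and the canonical IV advances by the popcount of the alias mask.
  for (size_t IV = 0; IV < N; IV += SafeLanes) {
    for (size_t Lane = 0; Lane < SafeLanes; ++Lane) // alias mask
      if (IV + Lane < N)                            // active lane mask
        C[IV + Lane] = A[IV + Lane] + B[IV + Lane];
  }
}
```

Note how the zero-popcount compare takes over as the memcheck branch condition: rather than abandoning vectorization whenever the pointers are close, the vector loop is only skipped when no lane at all is safe.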
assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) && @@ -8999,7 +9100,7 @@ void LoopVectorizationPlanner::attachRuntimeChecks( "(e.g., adding 'restrict')."; }); } - VPlanTransforms::attachCheckBlock(Plan, MemCheckCond, MemCheckBlock, + VPlanTransforms::attachCheckBlock(Plan, MemCheckCondVPV, MemCheckBlockVP, HasBranchWeights); } } @@ -9966,6 +10067,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. if (VF.Width.isVector() || SelectedIC > 1) { + if (CM.getRTCheckStyle(*TTI) == RTCheckStyle::UseSafeEltsMask) + LoopsAliasMasked++; Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); // Bail out early if either the SCEV or memory runtime checks are known to diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 3840b464e6c2c..1ef2fd3b5b323 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1030,6 +1030,7 @@ class LLVM_ABI_FOR_TEST VPInstruction : public VPRecipeWithIRFlags, // during unrolling. ExtractPenultimateElement, LogicalAnd, // Non-poison propagating logical And. + PopCount, // Add an offset in bytes (second operand) to a base pointer (first // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 80a2e4bc3f754..da9380a91cbdf 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -137,6 +137,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) { case VPInstruction::BranchOnCond: case VPInstruction::BranchOnCount: return Type::getVoidTy(Ctx); + case VPInstruction::PopCount: + return Type::getInt64Ty(Ctx); default: break; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp index 663e31a499b01..1a0ed7eebcd06 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp @@ -640,11 +640,9 @@ void VPlanTransforms::createLoopRegions(VPlan &Plan) { // including memory overlap checks block and wrapping/unit-stride checks block. 
static constexpr uint32_t CheckBypassWeights[] = {1, 127}; -void VPlanTransforms::attachCheckBlock(VPlan &Plan, Value *Cond, - BasicBlock *CheckBlock, +void VPlanTransforms::attachCheckBlock(VPlan &Plan, VPValue *CondVPV, + VPBasicBlock *CheckBlockVPBB, bool AddBranchWeights) { - VPValue *CondVPV = Plan.getOrAddLiveIn(Cond); - VPBasicBlock *CheckBlockVPBB = Plan.createVPIRBasicBlock(CheckBlock); VPBlockBase *VectorPH = Plan.getVectorPreheader(); VPBlockBase *ScalarPH = Plan.getScalarPreheader(); VPBlockBase *PreVectorPH = VectorPH->getSinglePredecessor(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ba145ffa0b681..fcd1612ab20b0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -602,6 +602,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const { case VPInstruction::ExplicitVectorLength: case VPInstruction::AnyOf: case VPInstruction::Not: + case VPInstruction::PopCount: return true; default: return false; @@ -702,6 +703,29 @@ Value *VPInstruction::generate(VPTransformState &State) { {PredTy, ScalarTC->getType()}, {VIVElem0, ScalarTC}, nullptr, Name); } + // Count the number of bits set in each lane and reduce the result to a scalar + case VPInstruction::PopCount: { + Value *Op = State.get(getOperand(0)); + Type *VT = Op->getType(); + Value *Cnt = Op; + + // i1 vectors can just use the add reduction. Bigger elements need a ctpop + // first. + if (VT->getScalarSizeInBits() > 1) + Cnt = Builder.CreateIntrinsic(Intrinsic::ctpop, {VT}, {Cnt}); + + auto *VecVT = cast(VT); + // Extend to an i8 since i1 is too small to add with + if (VecVT->getElementType()->getScalarSizeInBits() < 8) { + Cnt = Builder.CreateCast( + Instruction::ZExt, Cnt, + VectorType::get(Builder.getInt8Ty(), VecVT->getElementCount())); + } + + Cnt = Builder.CreateUnaryIntrinsic(Intrinsic::vector_reduce_add, Cnt); + Cnt = Builder.CreateCast(Instruction::ZExt, Cnt, Builder.getInt64Ty()); + return Cnt; + } case VPInstruction::FirstOrderRecurrenceSplice: { // Generate code to combine the previous and current values in vector v3. 
// @@ -1214,7 +1238,7 @@ bool VPInstruction::isVectorToScalar() const { getOpcode() == VPInstruction::ComputeAnyOfResult || getOpcode() == VPInstruction::ComputeFindIVResult || getOpcode() == VPInstruction::ComputeReductionResult || - getOpcode() == VPInstruction::AnyOf; + getOpcode() == VPInstruction::AnyOf || getOpcode() == PopCount; } bool VPInstruction::isSingleScalar() const { @@ -1389,6 +1413,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::ActiveLaneMask: O << "active lane mask"; break; + case VPInstruction::PopCount: + O << "popcount"; + break; case VPInstruction::ExplicitVectorLength: O << "EXPLICIT-VECTOR-LENGTH"; break; @@ -4316,7 +4343,9 @@ void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent, getOperand(4)->printAsOperand(O, SlotTracker); } } +#endif +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "EMIT "; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 10afd006c90c9..0257e343b6a0f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2385,7 +2385,7 @@ void VPlanTransforms::optimize(VPlan &Plan) { // %Negated = Not %ALM // branch-on-cond %Negated // -static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( +static VPSingleDefRecipe *addVPLaneMaskPhiAndUpdateExitBranch( VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) { VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); @@ -2508,6 +2508,7 @@ static VPSingleDefRecipe *findHeaderMask(VPlan &Plan) { void VPlanTransforms::addActiveLaneMask( VPlan &Plan, bool UseActiveLaneMaskForControlFlow, bool DataAndControlFlowWithoutRuntimeCheck) { + assert((!DataAndControlFlowWithoutRuntimeCheck || UseActiveLaneMaskForControlFlow) && "DataAndControlFlowWithoutRuntimeCheck implies " diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 34850743e7b62..d1c868f792f60 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -129,10 +129,10 @@ struct VPlanTransforms { /// flat CFG into a hierarchical CFG. LLVM_ABI_FOR_TEST static void createLoopRegions(VPlan &Plan); - /// Wrap runtime check block \p CheckBlock in a VPIRBB and \p Cond in a - /// VPValue and connect the block to \p Plan, using the VPValue as branch + /// Connect \p CheckBlockVPBB to \p Plan, using the \p CondVPV as branch /// condition. 
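The `PopCount` opcode introduced above feeds both the empty-mask guard and the IV increment: it reduces an i1 mask to the number of enabled lanes as an i64 scalar. A minimal scalar model of the lowering in `VPInstruction::generate` (names here are assumed, and the per-lane `ctpop` path for wider-than-i1 inputs is omitted):

```cpp
#include <cstdint>
#include <numeric>
#include <vector>

// Minimal model of the PopCount lowering: zero-extend the i1 mask to i8,
// sum it with an add reduction, then zero-extend the scalar result to i64.
uint64_t popCountMask(const std::vector<bool> &Mask) {
  std::vector<uint8_t> Ext(Mask.begin(), Mask.end());                // zext to i8
  uint8_t Sum = std::accumulate(Ext.begin(), Ext.end(), uint8_t{0}); // reduce.add
  return uint64_t{Sum};                                              // zext to i64
}
```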
- static void attachCheckBlock(VPlan &Plan, Value *Cond, BasicBlock *CheckBlock, + static void attachCheckBlock(VPlan &Plan, VPValue *CondVPV, + VPBasicBlock *CheckBlockVPBB, bool AddBranchWeights); /// Replaces the VPInstructions in \p Plan with corresponding diff --git a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp index e22c5dfdb9f38..90f3e1774abb7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUtils.cpp @@ -42,7 +42,9 @@ VPValue *vputils::getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr) { if (U && !isa(U->getValue())) return Plan.getOrAddLiveIn(U->getValue()); auto *Expanded = new VPExpandSCEVRecipe(Expr); - Plan.getEntry()->appendRecipe(Expanded); + VPRecipeBase *ExpandedR = Expanded->getDefiningRecipe(); + + ExpandedR->insertBefore(*Plan.getEntry(), Plan.getEntry()->getFirstNonPhi()); return Expanded; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll new file mode 100644 index 0000000000000..e641a415ac1d5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/alias_mask.ll @@ -0,0 +1,320 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^middle.block:" --filter-out-after "^scalar.ph:" --version 4 +; RUN: opt -S -mtriple=aarch64-unknown-linux-gnu -mattr=+sve2 -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -force-vector-interleave=1 %s | FileCheck %s + +define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i64 %n) { +; CHECK-LABEL: define dso_local void @alias_mask( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[C3:%.*]] = ptrtoint ptr [[C]] to i64 +; CHECK-NEXT: [[B2:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoint ptr [[C]] to i64 +; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP19:%.*]] = mul nuw i64 [[TMP5]], 16 +; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[C1]], [[B2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP22]], [[TMP19]] +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[C3]] to ptr +; CHECK-NEXT: [[TMP27:%.*]] = inttoptr i64 [[B2]] to ptr +; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP27]], ptr [[TMP4]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = zext [[ALIAS_LANE_MASK]] to +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8( [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP9]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP8]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP14]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp ugt i64 [[N]], [[TMP14]] +; CHECK-NEXT: [[TMP13:%.*]] = select i1 [[TMP17]], i64 [[TMP15]], i64 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: 
vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP25:%.*]] = and [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP16]], [[TMP25]], poison) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP18]], [[TMP25]], poison) +; CHECK-NEXT: [[TMP20:%.*]] = add [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP20]], ptr align 1 [[TMP21]], [[TMP25]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP13]]) +; CHECK-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = xor i1 [[TMP23]], true +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; +entry: + %cmp11 = icmp sgt i64 %n, 0 + br i1 %cmp11, label %for.body, label %exit + +for.body: ; preds = %for.body.preheader, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %add = add i8 %load.b, %load.a + %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv + store i8 %add, ptr %gep.c, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body, %entry + ret void +} + +define i32 @alias_mask_read_after_write(ptr noalias %a, ptr %b, ptr %c, i64 %n) { +; CHECK-LABEL: define i32 @alias_mask_read_after_write( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[C2:%.*]] = ptrtoint ptr [[C]] to i64 +; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP13]], 4 +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP20]], 4 +; CHECK-NEXT: [[TMP29:%.*]] = sub i64 [[B1]], [[C2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP29]], [[TMP22]] +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[B3]] to ptr +; CHECK-NEXT: [[TMP4:%.*]] = inttoptr i64 [[C2]] to ptr +; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[TMP4]], ptr [[TMP5]], i64 4) +; CHECK-NEXT: [[TMP0:%.*]] = zext [[ALIAS_LANE_MASK]] to +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8( [[TMP0]]) +; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i64 +; CHECK-NEXT: 
[[TMP10:%.*]] = icmp eq i64 [[TMP2]], 0 +; CHECK-NEXT: br i1 [[TMP10]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP18:%.*]] = shl nuw i64 [[TMP9]], 2 +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 [[N]], [[TMP18]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp ugt i64 [[N]], [[TMP18]] +; CHECK-NEXT: [[TMP14:%.*]] = select i1 [[TMP16]], i64 [[TMP15]], i64 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP31:%.*]] = and [[ACTIVE_LANE_MASK]], [[ALIAS_LANE_MASK]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP17]], [[TMP31]], poison) +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr align 2 [[TMP19]], [[TMP31]]) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP21]], [[TMP31]], poison) +; CHECK-NEXT: [[TMP23:%.*]] = add [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP24:%.*]] = add [[TMP23]], [[WIDE_MASKED_LOAD5]] +; CHECK-NEXT: [[TMP25]] = select [[TMP31]], [[TMP24]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP14]]) +; CHECK-NEXT: [[TMP28:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = xor i1 [[TMP28]], true +; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; +entry: + %cmp19 = icmp sgt i64 %n, 0 + br i1 %cmp19, label %for.body, label %exit + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add2, %for.body ] + %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv + %load.a = load i32, ptr %gep.a, align 2 + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %load.a, ptr %gep.c, align 2 + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %load.b = load i32, ptr %gep.b, align 2 + %add = add i32 %load.a, %accum + %add2 = add i32 %add, %load.b + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %entry, %for.body + %result = phi i32 [ 0, %entry ], [ %add2, %for.body ] + ret i32 %result +} + +define dso_local void @alias_mask_multiple(ptr %a, ptr %b, ptr %c, i64 %n) { +; CHECK-LABEL: define dso_local void @alias_mask_multiple( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[C5:%.*]] = ptrtoint ptr [[C]] to i64 +; CHECK-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64 +; 
CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoint ptr [[C]] to i64 +; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP32:%.*]] = mul nuw i64 [[TMP31]], 16 +; CHECK-NEXT: [[TMP33:%.*]] = sub i64 [[C1]], [[A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP33]], [[TMP32]] +; CHECK-NEXT: [[TMP34:%.*]] = sub i64 [[C1]], [[B3]] +; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP34]], [[TMP32]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr i64 [[C5]] to ptr +; CHECK-NEXT: [[TMP35:%.*]] = inttoptr i64 [[A6]] to ptr +; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP35]], ptr [[TMP5]], i64 1) +; CHECK-NEXT: [[TMP36:%.*]] = inttoptr i64 [[C5]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[B3]] to ptr +; CHECK-NEXT: [[ALIAS_LANE_MASK4:%.*]] = call @llvm.loop.dependence.war.mask.nxv16i1(ptr [[TMP6]], ptr [[TMP36]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = and [[ALIAS_LANE_MASK]], [[ALIAS_LANE_MASK4]] +; CHECK-NEXT: [[TMP1:%.*]] = zext [[TMP0]] to +; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.add.nxv16i8( [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP12]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 4 +; CHECK-NEXT: [[TMP21:%.*]] = sub i64 [[N]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp ugt i64 [[N]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP24:%.*]] = and [[ACTIVE_LANE_MASK]], [[TMP0]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP25]], [[TMP24]], poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP26]], [[TMP24]], poison) +; CHECK-NEXT: [[TMP27:%.*]] = add [[WIDE_MASKED_LOAD14]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP27]], ptr align 1 [[TMP28]], [[TMP24]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP23]]) +; CHECK-NEXT: [[TMP29:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[TMP30:%.*]] = xor i1 [[TMP29]], true +; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP6:![0-9]+]] +; CHECK: middle.block: +; +entry: + %cmp11 = icmp sgt i64 %n, 0 + br i1 %cmp11, label %for.body, label %exit + +for.body: ; preds = %for.body.preheader, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.a = getelementptr inbounds i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %gep.b = getelementptr inbounds i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %add = add i8 %load.b, %load.a + %gep.c = getelementptr inbounds i8, ptr %c, i64 %iv + store i8 %add, ptr %gep.c, align 1 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %for.body, %entry + ret void +} + +define i32 @alias_mask_multiple_read_after_write(ptr %a, ptr %b, ptr %c, i64 %n) { +; CHECK-LABEL: define i32 @alias_mask_multiple_read_after_write( +; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]], ptr [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[B7:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[C5:%.*]] = ptrtoint ptr [[C]] to i64 +; CHECK-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64 +; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-NEXT: [[C1:%.*]] = ptrtoint ptr [[C]] to i64 +; CHECK-NEXT: [[CMP19:%.*]] = icmp sgt i64 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP19]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[VECTOR_MEMCHECK:%.*]] +; CHECK: vector.memcheck: +; CHECK-NEXT: [[TMP33:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP34:%.*]] = mul nuw i64 [[TMP33]], 4 +; CHECK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4 +; CHECK-NEXT: [[TMP36:%.*]] = sub i64 [[C1]], [[A2]] +; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP36]], [[TMP35]] +; CHECK-NEXT: [[TMP37:%.*]] = mul i64 [[TMP34]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[B3]], [[C1]] +; CHECK-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP5]], [[TMP37]] +; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] +; CHECK-NEXT: [[TMP38:%.*]] = inttoptr i64 [[C5]] to ptr +; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[A6]] to ptr +; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call @llvm.loop.dependence.war.mask.nxv4i1(ptr [[TMP6]], ptr [[TMP38]], i64 4) +; CHECK-NEXT: [[TMP40:%.*]] = inttoptr i64 [[B7]] to ptr +; CHECK-NEXT: [[TMP39:%.*]] = inttoptr i64 [[C5]] to ptr +; CHECK-NEXT: [[ALIAS_LANE_MASK4:%.*]] = call @llvm.loop.dependence.raw.mask.nxv4i1(ptr [[TMP39]], ptr [[TMP40]], i64 4) +; CHECK-NEXT: [[TMP0:%.*]] = and [[ALIAS_LANE_MASK]], [[ALIAS_LANE_MASK4]] +; CHECK-NEXT: [[TMP1:%.*]] = zext [[TMP0]] to +; CHECK-NEXT: [[TMP2:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8( [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = zext i8 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP3]], 0 +; CHECK-NEXT: br i1 [[TMP14]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP19]], 2 +; CHECK-NEXT: [[TMP21:%.*]] = sub i64 [[N]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = icmp ugt i64 [[N]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = select i1 [[TMP22]], i64 [[TMP21]], i64 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], 
[[VECTOR_BODY]] ] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP24:%.*]] = and [[ACTIVE_LANE_MASK]], [[TMP0]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP25]], [[TMP24]], poison) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr align 2 [[TMP26]], [[TMP24]]) +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 2 [[TMP27]], [[TMP24]], poison) +; CHECK-NEXT: [[TMP28:%.*]] = add [[WIDE_MASKED_LOAD]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP29:%.*]] = add [[TMP28]], [[WIDE_MASKED_LOAD14]] +; CHECK-NEXT: [[TMP30]] = select [[TMP24]], [[TMP29]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP3]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP23]]) +; CHECK-NEXT: [[TMP31:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-NEXT: [[TMP32:%.*]] = xor i1 [[TMP31]], true +; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: middle.block: +; +entry: + %cmp19 = icmp sgt i64 %n, 0 + br i1 %cmp19, label %for.body, label %exit + +for.body: ; preds = %entry, %for.body + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add2, %for.body ] + %gep.a = getelementptr inbounds i32, ptr %a, i64 %iv + %load.a = load i32, ptr %gep.a, align 2 + %gep.c = getelementptr inbounds i32, ptr %c, i64 %iv + store i32 %load.a, ptr %gep.c, align 2 + %gep.b = getelementptr inbounds i32, ptr %b, i64 %iv + %load.b = load i32, ptr %gep.b, align 2 + %add = add i32 %load.a, %accum + %add2 = add i32 %add, %load.b + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, %n + br i1 %exitcond.not, label %exit, label %for.body + +exit: ; preds = %entry, %for.body + %result = phi i32 [ 0, %entry ], [ %add2, %for.body ] + ret i32 %result +} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll index 9b4151f30d640..e5a093d54e091 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll @@ -5,7 +5,7 @@ ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a14 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a15 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s ; RUN: opt -passes=loop-vectorize -mtriple=arm64-apple-macos -mcpu=apple-a16 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s -; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4 %s +; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v2 -S %s | FileCheck --check-prefix=INTERLEAVE-4-SVE %s ; RUN: opt -passes=loop-vectorize -mtriple=arm64 -mcpu=neoverse-v3 -S %s | FileCheck --check-prefix=INTERLEAVE-4-VLA %s ; Tests for selecting 
interleave counts for loops with loads and stores. @@ -70,7 +70,7 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; INTERLEAVE-4: vec.epilog.iter.check: ; INTERLEAVE-4-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 -; INTERLEAVE-4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; INTERLEAVE-4-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; INTERLEAVE-4: vec.epilog.ph: ; INTERLEAVE-4-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; INTERLEAVE-4-NEXT: [[N_MOD_VF9:%.*]] = urem i64 [[N]], 8 @@ -91,7 +91,7 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-4-NEXT: store <8 x i8> [[TMP39]], ptr [[TMP40]], align 1 ; INTERLEAVE-4-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX12]], 8 ; INTERLEAVE-4-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT18]], [[N_VEC10]] -; INTERLEAVE-4-NEXT: br i1 [[TMP42]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; INTERLEAVE-4-NEXT: br i1 [[TMP42]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; INTERLEAVE-4: vec.epilog.middle.block: ; INTERLEAVE-4-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[N_VEC10]] ; INTERLEAVE-4-NEXT: br i1 [[CMP_N11]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -109,7 +109,7 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-4-NEXT: store i8 [[SEL]], ptr [[GEP_DST]], align 1 ; INTERLEAVE-4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; INTERLEAVE-4-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; INTERLEAVE-4-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; INTERLEAVE-4-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; INTERLEAVE-4: exit: ; INTERLEAVE-4-NEXT: ret void ; @@ -158,7 +158,7 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-2-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; INTERLEAVE-2: vec.epilog.iter.check: ; INTERLEAVE-2-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 -; INTERLEAVE-2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; INTERLEAVE-2-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; INTERLEAVE-2: vec.epilog.ph: ; INTERLEAVE-2-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; INTERLEAVE-2-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[N]], 8 @@ -179,7 +179,7 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-2-NEXT: store <8 x i8> [[TMP23]], ptr [[TMP24]], align 1 ; INTERLEAVE-2-NEXT: [[INDEX_NEXT16]] = add nuw i64 [[INDEX10]], 8 ; INTERLEAVE-2-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT16]], [[N_VEC8]] -; INTERLEAVE-2-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; INTERLEAVE-2-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop 
[[LOOP4:![0-9]+]] ; INTERLEAVE-2: vec.epilog.middle.block: ; INTERLEAVE-2-NEXT: [[CMP_N9:%.*]] = icmp eq i64 [[N]], [[N_VEC8]] ; INTERLEAVE-2-NEXT: br i1 [[CMP_N9]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -197,10 +197,120 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-2-NEXT: store i8 [[SEL]], ptr [[GEP_DST]], align 1 ; INTERLEAVE-2-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; INTERLEAVE-2-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; INTERLEAVE-2-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; INTERLEAVE-2-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] ; INTERLEAVE-2: exit: ; INTERLEAVE-2-NEXT: ret void ; +; INTERLEAVE-4-SVE-LABEL: @interleave_single_load_store( +; INTERLEAVE-4-SVE-NEXT: iter.check: +; INTERLEAVE-4-SVE-NEXT: [[DST3:%.*]] = ptrtoint ptr [[DST:%.*]] to i64 +; INTERLEAVE-4-SVE-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC:%.*]] to i64 +; INTERLEAVE-4-SVE-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 +; INTERLEAVE-4-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N:%.*]], 8 +; INTERLEAVE-4-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; INTERLEAVE-4-SVE: vector.memcheck: +; INTERLEAVE-4-SVE-NEXT: [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]] +; INTERLEAVE-4-SVE-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 64 +; INTERLEAVE-4-SVE-NEXT: [[TMP2:%.*]] = inttoptr i64 [[DST3]] to ptr +; INTERLEAVE-4-SVE-NEXT: [[TMP1:%.*]] = inttoptr i64 [[SRC2]] to ptr +; INTERLEAVE-4-SVE-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <16 x i1> @llvm.loop.dependence.war.mask.v16i1(ptr [[TMP1]], ptr [[TMP2]], i64 1) +; INTERLEAVE-4-SVE-NEXT: [[TMP3:%.*]] = zext <16 x i1> [[ALIAS_LANE_MASK]] to <16 x i8> +; INTERLEAVE-4-SVE-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP3]]) +; INTERLEAVE-4-SVE-NEXT: [[TMP5:%.*]] = zext i8 [[TMP4]] to i64 +; INTERLEAVE-4-SVE-NEXT: [[TMP7:%.*]] = icmp eq i64 [[TMP5]], 0 +; INTERLEAVE-4-SVE-NEXT: br i1 [[TMP7]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; INTERLEAVE-4-SVE: vector.main.loop.iter.check: +; INTERLEAVE-4-SVE-NEXT: [[MIN_ITERS_CHECK4:%.*]] = icmp ult i64 [[N]], 64 +; INTERLEAVE-4-SVE-NEXT: br i1 [[MIN_ITERS_CHECK4]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; INTERLEAVE-4-SVE: vector.ph: +; INTERLEAVE-4-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 64 +; INTERLEAVE-4-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; INTERLEAVE-4-SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[B:%.*]], i64 0 +; INTERLEAVE-4-SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; INTERLEAVE-4-SVE-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement <16 x i8> poison, i8 [[A:%.*]], i64 0 +; INTERLEAVE-4-SVE-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT5]], <16 x i8> poison, <16 x i32> zeroinitializer +; INTERLEAVE-4-SVE-NEXT: br label [[VECTOR_BODY:%.*]] +; INTERLEAVE-4-SVE: vector.body: +; INTERLEAVE-4-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-4-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]] +; INTERLEAVE-4-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 16 +; INTERLEAVE-4-SVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 32 +; 
INTERLEAVE-4-SVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 48 +; INTERLEAVE-4-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 +; INTERLEAVE-4-SVE-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; INTERLEAVE-4-SVE-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; INTERLEAVE-4-SVE-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 +; INTERLEAVE-4-SVE-NEXT: [[TMP12:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; INTERLEAVE-4-SVE-NEXT: [[TMP13:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD7]], [[BROADCAST_SPLAT]] +; INTERLEAVE-4-SVE-NEXT: [[TMP14:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]] +; INTERLEAVE-4-SVE-NEXT: [[TMP15:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD9]], [[BROADCAST_SPLAT]] +; INTERLEAVE-4-SVE-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT6]]) +; INTERLEAVE-4-SVE-NEXT: [[TMP17:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD7]], <16 x i8> [[BROADCAST_SPLAT6]]) +; INTERLEAVE-4-SVE-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD8]], <16 x i8> [[BROADCAST_SPLAT6]]) +; INTERLEAVE-4-SVE-NEXT: [[TMP19:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[WIDE_LOAD9]], <16 x i8> [[BROADCAST_SPLAT6]]) +; INTERLEAVE-4-SVE-NEXT: [[TMP20:%.*]] = select <16 x i1> [[TMP12]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP16]] +; INTERLEAVE-4-SVE-NEXT: [[TMP21:%.*]] = select <16 x i1> [[TMP13]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP17]] +; INTERLEAVE-4-SVE-NEXT: [[TMP22:%.*]] = select <16 x i1> [[TMP14]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP18]] +; INTERLEAVE-4-SVE-NEXT: [[TMP23:%.*]] = select <16 x i1> [[TMP15]], <16 x i8> [[BROADCAST_SPLAT]], <16 x i8> [[TMP19]] +; INTERLEAVE-4-SVE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; INTERLEAVE-4-SVE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 16 +; INTERLEAVE-4-SVE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 32 +; INTERLEAVE-4-SVE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP24]], i32 48 +; INTERLEAVE-4-SVE-NEXT: store <16 x i8> [[TMP20]], ptr [[TMP24]], align 1 +; INTERLEAVE-4-SVE-NEXT: store <16 x i8> [[TMP21]], ptr [[TMP25]], align 1 +; INTERLEAVE-4-SVE-NEXT: store <16 x i8> [[TMP22]], ptr [[TMP26]], align 1 +; INTERLEAVE-4-SVE-NEXT: store <16 x i8> [[TMP23]], ptr [[TMP27]], align 1 +; INTERLEAVE-4-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; INTERLEAVE-4-SVE-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; INTERLEAVE-4-SVE-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; INTERLEAVE-4-SVE: middle.block: +; INTERLEAVE-4-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; INTERLEAVE-4-SVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; INTERLEAVE-4-SVE: vec.epilog.iter.check: +; INTERLEAVE-4-SVE-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 +; INTERLEAVE-4-SVE-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] +; INTERLEAVE-4-SVE: vec.epilog.ph: +; INTERLEAVE-4-SVE-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; INTERLEAVE-4-SVE-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[N]], 8 +; INTERLEAVE-4-SVE-NEXT: [[N_VEC11:%.*]] = sub i64 
[[N]], [[N_MOD_VF10]] +; INTERLEAVE-4-SVE-NEXT: [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; INTERLEAVE-4-SVE-NEXT: [[BROADCAST_SPLAT13:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT12]], <8 x i8> poison, <8 x i32> zeroinitializer +; INTERLEAVE-4-SVE-NEXT: [[BROADCAST_SPLATINSERT14:%.*]] = insertelement <8 x i8> poison, i8 [[A]], i64 0 +; INTERLEAVE-4-SVE-NEXT: [[BROADCAST_SPLAT15:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT14]], <8 x i8> poison, <8 x i32> zeroinitializer +; INTERLEAVE-4-SVE-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; INTERLEAVE-4-SVE: vec.epilog.vector.body: +; INTERLEAVE-4-SVE-NEXT: [[INDEX16:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; INTERLEAVE-4-SVE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX16]] +; INTERLEAVE-4-SVE-NEXT: [[WIDE_LOAD17:%.*]] = load <8 x i8>, ptr [[TMP29]], align 1 +; INTERLEAVE-4-SVE-NEXT: [[TMP30:%.*]] = icmp sgt <8 x i8> [[WIDE_LOAD17]], [[BROADCAST_SPLAT13]] +; INTERLEAVE-4-SVE-NEXT: [[TMP31:%.*]] = call <8 x i8> @llvm.smax.v8i8(<8 x i8> [[WIDE_LOAD17]], <8 x i8> [[BROADCAST_SPLAT15]]) +; INTERLEAVE-4-SVE-NEXT: [[TMP32:%.*]] = select <8 x i1> [[TMP30]], <8 x i8> [[BROADCAST_SPLAT13]], <8 x i8> [[TMP31]] +; INTERLEAVE-4-SVE-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX16]] +; INTERLEAVE-4-SVE-NEXT: store <8 x i8> [[TMP32]], ptr [[TMP33]], align 1 +; INTERLEAVE-4-SVE-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX16]], 8 +; INTERLEAVE-4-SVE-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT18]], [[N_VEC11]] +; INTERLEAVE-4-SVE-NEXT: br i1 [[TMP34]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; INTERLEAVE-4-SVE: vec.epilog.middle.block: +; INTERLEAVE-4-SVE-NEXT: [[CMP_N19:%.*]] = icmp eq i64 [[N]], [[N_VEC11]] +; INTERLEAVE-4-SVE-NEXT: br i1 [[CMP_N19]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; INTERLEAVE-4-SVE: vec.epilog.scalar.ph: +; INTERLEAVE-4-SVE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC11]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; INTERLEAVE-4-SVE-NEXT: br label [[LOOP:%.*]] +; INTERLEAVE-4-SVE: loop: +; INTERLEAVE-4-SVE-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] +; INTERLEAVE-4-SVE-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[IV]] +; INTERLEAVE-4-SVE-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC]], align 1 +; INTERLEAVE-4-SVE-NEXT: [[CMP:%.*]] = icmp sgt i8 [[L]], [[B]] +; INTERLEAVE-4-SVE-NEXT: [[MAX:%.*]] = tail call i8 @llvm.smax.i8(i8 [[L]], i8 [[A]]) +; INTERLEAVE-4-SVE-NEXT: [[SEL:%.*]] = select i1 [[CMP]], i8 [[B]], i8 [[MAX]] +; INTERLEAVE-4-SVE-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] +; INTERLEAVE-4-SVE-NEXT: store i8 [[SEL]], ptr [[GEP_DST]], align 1 +; INTERLEAVE-4-SVE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; INTERLEAVE-4-SVE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; INTERLEAVE-4-SVE-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; INTERLEAVE-4-SVE: exit: +; INTERLEAVE-4-SVE-NEXT: ret void +; ; INTERLEAVE-4-VLA-LABEL: @interleave_single_load_store( ; INTERLEAVE-4-VLA: call @llvm.smax.nxv16i8( ; INTERLEAVE-4-VLA-NEXT: call @llvm.smax.nxv16i8( diff --git 
a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll index 76a7536501bd6..68595cdaec570 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fixed-width-inorder-core.ll @@ -7,6 +7,8 @@ define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { ; CHECK-CA510-LABEL: define void @sve_add( ; CHECK-CA510-SAME: ptr [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-CA510-NEXT: [[ENTRY:.*:]] +; CHECK-CA510-NEXT: [[A6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-CA510-NEXT: [[DST5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-CA510-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-CA510-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-CA510-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 @@ -21,7 +23,18 @@ define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { ; CHECK-CA510-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[B3]] ; CHECK-CA510-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32 ; CHECK-CA510-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] -; CHECK-CA510-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK-CA510-NEXT: [[TMP17:%.*]] = inttoptr i64 [[DST5]] to ptr +; CHECK-CA510-NEXT: [[TMP12:%.*]] = inttoptr i64 [[A6]] to ptr +; CHECK-CA510-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP12]], ptr [[TMP17]], i64 4) +; CHECK-CA510-NEXT: [[TMP18:%.*]] = inttoptr i64 [[DST5]] to ptr +; CHECK-CA510-NEXT: [[TMP4:%.*]] = inttoptr i64 [[B3]] to ptr +; CHECK-CA510-NEXT: [[ALIAS_LANE_MASK7:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP4]], ptr [[TMP18]], i64 4) +; CHECK-CA510-NEXT: [[TMP19:%.*]] = and <4 x i1> [[ALIAS_LANE_MASK]], [[ALIAS_LANE_MASK7]] +; CHECK-CA510-NEXT: [[TMP7:%.*]] = zext <4 x i1> [[TMP19]] to <4 x i8> +; CHECK-CA510-NEXT: [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP7]]) +; CHECK-CA510-NEXT: [[TMP21:%.*]] = zext i8 [[TMP20]] to i64 +; CHECK-CA510-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 0 +; CHECK-CA510-NEXT: br i1 [[TMP22]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; CHECK-CA510: [[VECTOR_PH]]: ; CHECK-CA510-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-CA510-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] @@ -42,7 +55,7 @@ define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { ; CHECK-CA510-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 4 ; CHECK-CA510-NEXT: store <4 x float> [[TMP9]], ptr [[TMP11]], align 4 ; CHECK-CA510-NEXT: store <4 x float> [[TMP10]], ptr [[TMP13]], align 4 -; CHECK-CA510-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP2]], 8 +; CHECK-CA510-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP2]], [[TMP21]] ; CHECK-CA510-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-CA510-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-CA510: [[MIDDLE_BLOCK]]: @@ -71,6 +84,8 @@ define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { ; CHECK-CA520-LABEL: define void @sve_add( ; CHECK-CA520-SAME: ptr [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-CA520-NEXT: [[ENTRY:.*:]] +; CHECK-CA520-NEXT: [[A6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-CA520-NEXT: [[DST5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-CA520-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64 ; 
CHECK-CA520-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-CA520-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 @@ -85,7 +100,18 @@ define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { ; CHECK-CA520-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[B3]] ; CHECK-CA520-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32 ; CHECK-CA520-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] -; CHECK-CA520-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK-CA520-NEXT: [[TMP17:%.*]] = inttoptr i64 [[DST5]] to ptr +; CHECK-CA520-NEXT: [[TMP12:%.*]] = inttoptr i64 [[A6]] to ptr +; CHECK-CA520-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP12]], ptr [[TMP17]], i64 4) +; CHECK-CA520-NEXT: [[TMP18:%.*]] = inttoptr i64 [[DST5]] to ptr +; CHECK-CA520-NEXT: [[TMP4:%.*]] = inttoptr i64 [[B3]] to ptr +; CHECK-CA520-NEXT: [[ALIAS_LANE_MASK7:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP4]], ptr [[TMP18]], i64 4) +; CHECK-CA520-NEXT: [[TMP19:%.*]] = and <4 x i1> [[ALIAS_LANE_MASK]], [[ALIAS_LANE_MASK7]] +; CHECK-CA520-NEXT: [[TMP7:%.*]] = zext <4 x i1> [[TMP19]] to <4 x i8> +; CHECK-CA520-NEXT: [[TMP20:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP7]]) +; CHECK-CA520-NEXT: [[TMP21:%.*]] = zext i8 [[TMP20]] to i64 +; CHECK-CA520-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 0 +; CHECK-CA520-NEXT: br i1 [[TMP22]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; CHECK-CA520: [[VECTOR_PH]]: ; CHECK-CA520-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-CA520-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] @@ -106,7 +132,7 @@ define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { ; CHECK-CA520-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw float, ptr [[TMP11]], i32 4 ; CHECK-CA520-NEXT: store <4 x float> [[TMP9]], ptr [[TMP11]], align 4 ; CHECK-CA520-NEXT: store <4 x float> [[TMP10]], ptr [[TMP13]], align 4 -; CHECK-CA520-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP2]], 8 +; CHECK-CA520-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP2]], [[TMP21]] ; CHECK-CA520-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-CA520-NEXT: br i1 [[TMP14]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-CA520: [[MIDDLE_BLOCK]]: @@ -135,6 +161,8 @@ define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { ; CHECK-CA320-LABEL: define void @sve_add( ; CHECK-CA320-SAME: ptr [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-CA320-NEXT: [[ENTRY:.*:]] +; CHECK-CA320-NEXT: [[A6:%.*]] = ptrtoint ptr [[A]] to i64 +; CHECK-CA320-NEXT: [[DST5:%.*]] = ptrtoint ptr [[DST]] to i64 ; CHECK-CA320-NEXT: [[B3:%.*]] = ptrtoint ptr [[B]] to i64 ; CHECK-CA320-NEXT: [[A2:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-CA320-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 @@ -149,7 +177,18 @@ define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { ; CHECK-CA320-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[B3]] ; CHECK-CA320-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP1]], 32 ; CHECK-CA320-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] -; CHECK-CA320-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK-CA320-NEXT: [[TMP14:%.*]] = inttoptr i64 [[DST5]] to ptr +; CHECK-CA320-NEXT: [[TMP13:%.*]] = inttoptr i64 [[A6]] to ptr +; CHECK-CA320-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP13]], ptr [[TMP14]], i64 4) +; CHECK-CA320-NEXT: [[TMP16:%.*]] = 
inttoptr i64 [[DST5]] to ptr +; CHECK-CA320-NEXT: [[TMP15:%.*]] = inttoptr i64 [[B3]] to ptr +; CHECK-CA320-NEXT: [[ALIAS_LANE_MASK7:%.*]] = call <4 x i1> @llvm.loop.dependence.war.mask.v4i1(ptr [[TMP15]], ptr [[TMP16]], i64 4) +; CHECK-CA320-NEXT: [[TMP17:%.*]] = and <4 x i1> [[ALIAS_LANE_MASK]], [[ALIAS_LANE_MASK7]] +; CHECK-CA320-NEXT: [[TMP18:%.*]] = zext <4 x i1> [[TMP17]] to <4 x i8> +; CHECK-CA320-NEXT: [[TMP19:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP18]]) +; CHECK-CA320-NEXT: [[TMP20:%.*]] = zext i8 [[TMP19]] to i64 +; CHECK-CA320-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], 0 +; CHECK-CA320-NEXT: br i1 [[TMP21]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; CHECK-CA320: [[VECTOR_PH]]: ; CHECK-CA320-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 ; CHECK-CA320-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] @@ -170,7 +209,7 @@ define void @sve_add(ptr %dst, ptr %a, ptr %b, i64 %n) { ; CHECK-CA320-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw float, ptr [[TMP8]], i32 4 ; CHECK-CA320-NEXT: store <4 x float> [[TMP6]], ptr [[TMP8]], align 4 ; CHECK-CA320-NEXT: store <4 x float> [[TMP7]], ptr [[TMP9]], align 4 -; CHECK-CA320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-CA320-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]] ; CHECK-CA320-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-CA320-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-CA320: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll index ed49dc5a7573f..122a25efdfa56 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll @@ -13,6 +13,7 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64 ; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[N:%.*]], i32 999) ; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i32 [[UMIN]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() @@ -23,9 +24,16 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 { ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]] +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B1]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] -; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[A2]] to ptr +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[B1]] to ptr +; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr [[TMP14]], ptr [[TMP15]], i64 4) +; CHECK-NEXT: [[TMP16:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP16]]) +; CHECK-NEXT: [[TMP11:%.*]] = zext i8 [[TMP17]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[TMP11]], 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP8:%.*]] = mul nuw i32 [[TMP7]], 8 @@ -36,19 +44,20 @@ define void 
@multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP23:%.*]] = shl nuw i64 [[TMP22]], 2 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP23]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP29:%.*]] = shl nuw i64 [[TMP28]], 2 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]] ; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP25]], align 4 ; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD3]], ptr [[TMP30]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP11]] to i32 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP26]] ; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -83,6 +92,7 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[A2:%.*]] = ptrtoint ptr [[A:%.*]] to i64 ; CHECK-NEXT: [[B1:%.*]] = ptrtoint ptr [[B:%.*]] to i64 +; CHECK-NEXT: [[A1:%.*]] = ptrtoint ptr [[A]] to i64 ; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[N:%.*]], i32 999) ; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i32 [[UMIN]], 1 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() @@ -93,9 +103,16 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 { ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[B1]], [[A2]] +; CHECK-NEXT: [[TMP6:%.*]] = sub i64 [[A1]], [[B1]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] -; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP15:%.*]] = inttoptr i64 [[A2]] to ptr +; CHECK-NEXT: [[TMP14:%.*]] = inttoptr i64 [[B1]] to ptr +; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr [[TMP14]], ptr [[TMP15]], i64 4) +; CHECK-NEXT: [[TMP16:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8> +; CHECK-NEXT: [[TMP17:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP16]]) +; CHECK-NEXT: [[TMP11:%.*]] = zext i8 [[TMP17]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[TMP11]], 0 +; CHECK-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP8:%.*]] = mul nuw i32 [[TMP7]], 8 @@ -106,19 +123,20 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; 
CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP23:%.*]] = shl nuw i64 [[TMP22]], 2 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[TMP23]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP19]], align 4 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 4 x i32>, ptr [[TMP24]], align 4 -; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP29:%.*]] = shl nuw i64 [[TMP28]], 2 ; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP25]], i64 [[TMP29]] ; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD]], ptr [[TMP25]], align 4 ; CHECK-NEXT: store <vscale x 4 x i32> [[WIDE_LOAD3]], ptr [[TMP30]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP26:%.*]] = trunc i64 [[TMP11]] to i32 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP26]] ; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll index 871d9be609bd7..1ba4f5d4b6462 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll @@ -597,13 +597,12 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr % ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[ARRAY1:%.*]] = ptrtoint ptr [[ARRAY]] to i64 -; CHECK-NEXT: [[INDICES2:%.*]] = ptrtoint ptr [[INDICES]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[ARRAY1]], [[INDICES2]] -; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]] -; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[ALIAS_LANE_MASK:%.*]] = call <vscale x 4 x i1> @llvm.loop.dependence.war.mask.nxv4i1(ptr [[INDICES]], ptr [[ARRAY]], i64 4) +; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 4 x i1> [[ALIAS_LANE_MASK]] to <vscale x 4 x i8> +; CHECK-NEXT: [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> [[TMP10]]) +; CHECK-NEXT: [[TMP17:%.*]] = zext i8 [[TMP6]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i8 [[TMP6]], 0 +; CHECK-NEXT: br i1 [[TMP19]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 @@ -624,7 +623,7 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr % ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> [[TMP14]], i32 1, <vscale x 4 x i1> splat (i1 true)) ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[INDEX]] ; CHECK-NEXT: store <vscale x 4 x i32> [[VEC_IND]], ptr [[TMP15]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i32> [[VEC_IND]], [[DOTSPLAT]] 
; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing-predicated.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing-predicated.ll new file mode 100644 index 0000000000000..0b0c33193ba35 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing-predicated.ll @@ -0,0 +1,98 @@ +; REQUIRES: asserts +; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize -mattr=+sve2 -force-vector-width=4 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -force-tail-folding-style=data-and-control -disable-output %s 2>&1 | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-none-unknown-elf" + +; Tests for printing predicated VPlans. + +define dso_local void @alias_mask(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: 'alias_mask' +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%wide.trip.count> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<for.body.preheader>: +; CHECK-NEXT: IR %wide.trip.count = zext nneg i32 %n to i64 +; CHECK-NEXT: Successor(s): ir-bb<vector.memcheck> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<vector.memcheck>: +; CHECK-NEXT: IR %0 = sub i64 %c1, %b2 +; CHECK-NEXT: IR %diff.check = icmp ult i64 %0, 4 +; CHECK-NEXT: EMIT-SCALAR vp<[[PTRC:%.+]]> = inttoptr ir<%c3> to ptr +; CHECK-NEXT: EMIT-SCALAR vp<[[PTRB:%.+]]> = inttoptr ir<%b2> to ptr +; CHECK-NEXT: WIDEN-INTRINSIC vp<[[ALIAS_MASK:%.+]]> = call llvm.loop.dependence.war.mask(vp<[[PTRB]]>, vp<[[PTRC]]>, ir<1>) +; CHECK-NEXT: EMIT vp<[[POPCOUNT:%.+]]> = popcount vp<[[ALIAS_MASK]]> +; CHECK-NEXT: EMIT vp<[[COND:%.+]]> = icmp eq vp<[[POPCOUNT]]>, ir<0> +; CHECK-NEXT: EMIT branch-on-cond vp<[[COND]]> +; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%active.lane.mask.entry> = active lane mask ir<0>, ir<%wide.trip.count>, ir<1> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: ACTIVE-LANE-MASK-PHI vp<[[ACTIVE_LANE_MASK:%.+]]> = phi vp<%active.lane.mask.entry>, vp<%active.lane.mask.next> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<%index> +; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = and vp<[[ACTIVE_LANE_MASK]]>, vp<[[ALIAS_MASK]]> +; CHECK-NEXT: WIDEN ir<%1> = load ir<%arrayidx>, vp<[[MASK]]> +; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<%index> +; CHECK-NEXT: WIDEN ir<%2> = load ir<%arrayidx2>, vp<[[MASK]]> +; CHECK-NEXT: WIDEN ir<%add> = add ir<%2>, ir<%1> +; CHECK-NEXT: CLONE ir<%arrayidx6> = getelementptr inbounds ir<%c>, vp<%index> +; CHECK-NEXT: WIDEN store ir<%arrayidx6>, ir<%add>, vp<[[MASK]]> +; CHECK-NEXT: EMIT vp<%index.next> = add vp<%index>, vp<[[POPCOUNT]]> +; CHECK-NEXT: EMIT vp<%active.lane.mask.next> = active lane mask vp<%index.next>, ir<%wide.trip.count>, ir<1> +; CHECK-NEXT: EMIT vp<%10> = not vp<%active.lane.mask.next> +; CHECK-NEXT: EMIT branch-on-cond vp<%10> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>: +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<scalar.ph>: +; CHECK-NEXT: Successor(s): ir-bb<for.body> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<for.body>: +; 
CHECK-NEXT: IR %indvars.iv = phi i64 [ 0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: ir<0> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv +; CHECK-NEXT: IR %1 = load i8, ptr %arrayidx, align 1 +; CHECK-NEXT: IR %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv +; CHECK-NEXT: IR %2 = load i8, ptr %arrayidx2, align 1 +; CHECK-NEXT: IR %add = add i8 %2, %1 +; CHECK-NEXT: IR %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv +; CHECK-NEXT: IR store i8 %add, ptr %arrayidx6, align 1 +; CHECK-NEXT: IR %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 +; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + %cmp11 = icmp sgt i32 %n, 0 + br i1 %cmp11, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext nneg i32 %n to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i8, ptr %a, i64 %indvars.iv + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i8, ptr %b, i64 %indvars.iv + %1 = load i8, ptr %arrayidx2, align 1 + %add = add i8 %1, %0 + %arrayidx6 = getelementptr inbounds i8, ptr %c, i64 %indvars.iv + store i8 %add, ptr %arrayidx6, align 1 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} diff --git a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll index 5c62ca3ff3d01..1138cb7dcf4c9 100644 --- a/llvm/test/Transforms/LoopVectorize/pointer-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/pointer-induction.ll @@ -438,8 +438,8 @@ define i64 @ivopt_widen_ptr_indvar_1(ptr noalias %a, i64 %stride, i64 %n) { ; ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_1( ; STRIDED-NEXT: entry: -; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3 +; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; STRIDED: vector.ph: @@ -523,8 +523,8 @@ define i64 @ivopt_widen_ptr_indvar_2(ptr noalias %a, i64 %stride, i64 %n) { ; ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_2( ; STRIDED-NEXT: entry: -; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3 +; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; STRIDED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; STRIDED: vector.ph: @@ -630,8 +630,8 @@ define i64 @ivopt_widen_ptr_indvar_3(ptr noalias %a, i64 %stride, i64 %n) { ; ; STRIDED-LABEL: @ivopt_widen_ptr_indvar_3( ; STRIDED-NEXT: entry: -; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[TMP1:%.*]] = shl i64 [[STRIDE:%.*]], 3 +; STRIDED-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 ; STRIDED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; STRIDED-NEXT: br i1 
[[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; STRIDED: vector.ph: diff --git a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll index faca86a41b023..0791e74f8a2a1 100644 --- a/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll +++ b/llvm/test/Transforms/LoopVectorize/reuse-lcssa-phi-scev-expansion.ll @@ -206,10 +206,15 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog ; CHECK-NEXT: br i1 [[INVAR_C]], label %[[LOOP_2_PREHEADER:.*]], label %[[LOOP_1]] ; CHECK: [[LOOP_2_PREHEADER]]: ; CHECK-NEXT: [[IV_1_LCSSA:%.*]] = phi i32 [ [[IV_1]], %[[LOOP_1]] ] +; CHECK-NEXT: [[TMP0:%.*]] = sub i32 2, [[STEP]] +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP0]] +; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0) +; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR]], -1 +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]] ; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[IV_1_LCSSA]], [[STEP]] ; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP1]], i32 0) ; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[STEP]], -2 -; CHECK-NEXT: [[TMP3:%.*]] = mul i32 [[INDVAR]], -1 ; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[SMAX]], [[TMP4]] ; CHECK-NEXT: [[UMIN:%.*]] = call i32 @llvm.umin.i32(i32 [[TMP5]], i32 1) @@ -218,11 +223,6 @@ define void @expand_diff_scev_unknown(ptr %dst, i1 %invar.c, i32 %step) mustprog ; CHECK-NEXT: [[UMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[STEP]], i32 1) ; CHECK-NEXT: [[TMP8:%.*]] = udiv i32 [[TMP7]], [[UMAX]] ; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP16:%.*]] = sub i32 2, [[STEP]] -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[IV_1_LCSSA]], [[TMP16]] -; CHECK-NEXT: [[SMAX1:%.*]] = call i32 @llvm.smax.i32(i32 [[TMP12]], i32 0) -; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[TMP3]], -1 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[SMAX1]], [[TMP14]] ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP15]], 2 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] ; CHECK: [[VECTOR_SCEVCHECK]]: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll index 91e0037d12c61..dce40fae54105 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll @@ -443,10 +443,10 @@ define void @print_expand_scev(i64 %y, ptr %ptr) { ; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count ; CHECK-EMPTY: ; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: EMIT vp<[[EXP_SCEV:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060)) +; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060)) /u (1 + (%y /u 492802768830814060)))) ; CHECK-NEXT: IR %div = udiv i64 %y, 492802768830814060 ; CHECK-NEXT: IR %inc = add i64 %div, 1 -; CHECK-NEXT: EMIT vp<[[TC]]> = EXPAND SCEV (1 + ((15 + (%y /u 492802768830814060)) /u (1 + (%y /u 492802768830814060)))) -; CHECK-NEXT: EMIT vp<[[EXP_SCEV:%.+]]> = EXPAND SCEV (1 + (%y /u 492802768830814060)) ; CHECK-NEXT: Successor(s): scalar.ph, vector.ph ; CHECK-EMPTY: ; CHECK-NEXT: vector.ph: