From 2addb7b55d40762a08a727f07567961acc05e260 Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: Mon, 15 Jul 2024 17:57:30 +0100 Subject: [PATCH 1/7] [AArch64] Add MATCH loops to LoopIdiomVectorizePass This patch adds a new loop to LoopIdiomVectorizePass, enabling it to recognise and use @llvm.experimental.vector.match to vectorise loops such as: char* find_first_of(char *first, char *last, char *s_first, char *s_last) { for (; first != last; ++first) for (char *it = s_first; it != s_last; ++it) if (*first == *it) return first; return last; } These loops match the C++ standard library's std::find_first_of. --- .../Vectorize/LoopIdiomVectorize.cpp | 423 ++++++++++++++++- llvm/test/CodeGen/AArch64/find-first-byte.ll | 429 ++++++++++++++++++ 2 files changed, 843 insertions(+), 9 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/find-first-byte.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp index 7af7408ed67a8..a874dd9f8f181 100644 --- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp @@ -10,8 +10,10 @@ // transforms them into more optimized versions of the same loop. In cases // where this happens, it can be a significant performance win. // -// We currently only recognize one loop that finds the first mismatched byte -// in an array and returns the index, i.e. something like: +// We currently support two loops: +// +// 1. A loop that finds the first mismatched byte in an array and returns the +// index, i.e. something like: // // while (++i != n) { // if (a[i] != b[i]) @@ -24,12 +26,6 @@ // boundaries. However, even with these checks it is still profitable to do the // transformation. 
// -//===----------------------------------------------------------------------===// -// -// NOTE: This Pass matches a really specific loop pattern because it's only -// supposed to be a temporary solution until our LoopVectorizer is powerful -// enought to vectorize it automatically. -// // TODO List: // // * Add support for the inverse case where we scan for a matching element. @@ -37,6 +33,35 @@ // * Recognize loops that increment the IV *after* comparing bytes. // * Allow 32-bit sign-extends of the IV used by the GEP. // +// 2. A loop that finds the first matching character in an array among a set of +// possible matches, e.g.: +// +// for (; first != last; ++first) +// for (s_it = s_first; s_it != s_last; ++s_it) +// if (*first == *s_it) +// return first; +// return last; +// +// This corresponds to std::find_first_of (for arrays of bytes) from the C++ +// standard library. This function can be implemented efficiently for targets +// that support @llvm.experimental.vector.match. For example, on AArch64 targets +// that implement SVE2, this lowers to a MATCH instruction, which enables us to +// perform up to 16x16=256 comparisons in one go. This can lead to very +// significant speedups. +// +// TODO: +// +// * Add support for `find_first_not_of' loops (i.e. with not-equal comparison). +// * Make VF a configurable parameter (right now we assume 128-bit vectors). +// * Potentially adjust the cost model to let the transformation kick in even if +// @llvm.experimental.vector.match doesn't have direct support in hardware. +// +//===----------------------------------------------------------------------===// +// +// NOTE: This Pass matches really specific loop patterns because it's only +// supposed to be a temporary solution until our LoopVectorizer is powerful +// enough to vectorize them automatically. 
+// //===----------------------------------------------------------------------===// #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" @@ -79,6 +104,11 @@ static cl::opt cl::desc("The vectorization factor for byte-compare patterns."), cl::init(16)); +static cl::opt + DisableFindFirstByte("disable-loop-idiom-vectorize-find-first-byte", + cl::Hidden, cl::init(false), + cl::desc("Do not convert find-first-byte loop(s).")); + static cl::opt VerifyLoops("loop-idiom-vectorize-verify", cl::Hidden, cl::init(false), cl::desc("Verify loops generated Loop Idiom Vectorize Pass.")); @@ -136,6 +166,19 @@ class LoopIdiomVectorize { PHINode *IndPhi, Value *MaxLen, Instruction *Index, Value *Start, bool IncIdx, BasicBlock *FoundBB, BasicBlock *EndBB); + + bool recognizeFindFirstByte(); + + Value *expandFindFirstByte(IRBuilder<> &Builder, DomTreeUpdater &DTU, + unsigned VF, Type *CharTy, BasicBlock *ExitSucc, + BasicBlock *ExitFail, Value *SearchStart, + Value *SearchEnd, Value *NeedleStart, + Value *NeedleEnd); + + void transformFindFirstByte(PHINode *IndPhi, unsigned VF, Type *CharTy, + BasicBlock *ExitSucc, BasicBlock *ExitFail, + Value *SearchStart, Value *SearchEnd, + Value *NeedleStart, Value *NeedleEnd); /// @} }; } // anonymous namespace @@ -190,7 +233,13 @@ bool LoopIdiomVectorize::run(Loop *L) { LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F[" << F.getName() << "] Loop %" << CurLoop->getHeader()->getName() << "\n"); - return recognizeByteCompare(); + if (recognizeByteCompare()) + return true; + + if (recognizeFindFirstByte()) + return true; + + return false; } bool LoopIdiomVectorize::recognizeByteCompare() { @@ -939,3 +988,359 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA, report_fatal_error("Loops must remain in LCSSA form!"); } } + +bool LoopIdiomVectorize::recognizeFindFirstByte() { + // Currently the transformation only works on scalable vector types, although + // there is no fundamental reason why it cannot be made to work for fixed 
+ // vectors too. + if (!TTI->supportsScalableVectors() || DisableFindFirstByte) + return false; + + // Define some constants we need throughout. + BasicBlock *Header = CurLoop->getHeader(); + LLVMContext &Ctx = Header->getContext(); + + // We are expecting the four blocks defined below: Header, MatchBB, InnerBB, + // and OuterBB. For now, we will bail out for almost anything else. The four + // blocks contain one nested loop. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 4 || + CurLoop->getSubLoops().size() != 1) + return false; + + auto *InnerLoop = CurLoop->getSubLoops().front(); + PHINode *IndPhi = dyn_cast(&Header->front()); + if (!IndPhi || IndPhi->getNumIncomingValues() != 2) + return false; + + // Check instruction counts. + auto LoopBlocks = CurLoop->getBlocks(); + if (LoopBlocks[0]->sizeWithoutDebug() > 3 || + LoopBlocks[1]->sizeWithoutDebug() > 4 || + LoopBlocks[2]->sizeWithoutDebug() > 3 || + LoopBlocks[3]->sizeWithoutDebug() > 3) + return false; + + // Check that no instruction other than IndPhi has outside uses. + for (BasicBlock *BB : LoopBlocks) + for (Instruction &I : *BB) + if (&I != IndPhi) + for (User *U : I.users()) + if (!CurLoop->contains(cast(U))) + return false; + + // Match the branch instruction in the header. We are expecting an + // unconditional branch to the inner loop. + // + // Header: + // %14 = phi ptr [ %24, %OuterBB ], [ %3, %Header.preheader ] + // %15 = load i8, ptr %14, align 1 + // br label %MatchBB + BasicBlock *MatchBB; + if (!match(Header->getTerminator(), m_UnconditionalBr(MatchBB)) || + !InnerLoop->contains(MatchBB)) + return false; + + // MatchBB should be the entrypoint into the inner loop containing the + // comparison between a search element and a needle. 
+ // + // MatchBB: + // %20 = phi ptr [ %7, %Header ], [ %17, %InnerBB ] + // %21 = load i8, ptr %20, align 1 + // %22 = icmp eq i8 %15, %21 + // br i1 %22, label %ExitSucc, label %InnerBB + BasicBlock *ExitSucc, *InnerBB; + Value *LoadA, *LoadB; + ICmpInst::Predicate MatchPred; + if (!match(MatchBB->getTerminator(), + m_Br(m_ICmp(MatchPred, m_Value(LoadA), m_Value(LoadB)), + m_BasicBlock(ExitSucc), m_BasicBlock(InnerBB))) || + MatchPred != ICmpInst::Predicate::ICMP_EQ || + !InnerLoop->contains(InnerBB)) + return false; + + // We expect outside uses of `IndPhi' in ExitSucc (and only there). + for (User *U : IndPhi->users()) + if (!CurLoop->contains(cast(U))) + if (auto *PN = dyn_cast(U); !PN || PN->getParent() != ExitSucc) + return false; + + // Match the loads and check they are simple. + Value *A, *B; + if (!match(LoadA, m_Load(m_Value(A))) || !cast(LoadA)->isSimple() || + !match(LoadB, m_Load(m_Value(B))) || !cast(LoadB)->isSimple()) + return false; + + // Check we are loading valid characters. + Type *CharTy = LoadA->getType(); + if (!CharTy->isIntegerTy() || LoadB->getType() != CharTy) + return false; + + // Pick the vectorisation factor based on CharTy, work out the cost of the + // match intrinsic and decide if we should use it. + // Note: For the time being we assume 128-bit vectors. + unsigned VF = 128 / CharTy->getIntegerBitWidth(); + SmallVector Args = { + ScalableVectorType::get(CharTy, VF), FixedVectorType::get(CharTy, VF), + ScalableVectorType::get(Type::getInt1Ty(Ctx), VF)}; + IntrinsicCostAttributes Attrs(Intrinsic::experimental_vector_match, Args[2], + Args); + if (TTI->getIntrinsicInstrCost(Attrs, TTI::TCK_SizeAndLatency) > 4) + return false; + + // The loads come from two PHIs, each with two incoming values. 
+ PHINode *PNA = dyn_cast(A); + PHINode *PNB = dyn_cast(B); + if (!PNA || PNA->getNumIncomingValues() != 2 || !PNB || + PNB->getNumIncomingValues() != 2) + return false; + + // One PHI comes from the outer loop (PNA), the other one from the inner loop + // (PNB). PNA effectively corresponds to IndPhi. + if (InnerLoop->contains(PNA)) + std::swap(PNA, PNB); + if (PNA != &Header->front() || PNB != &MatchBB->front()) + return false; + + // The incoming values of both PHI nodes should be a gep of 1. + Value *StartA = PNA->getIncomingValue(0); + Value *IndexA = PNA->getIncomingValue(1); + if (CurLoop->contains(PNA->getIncomingBlock(0))) + std::swap(StartA, IndexA); + + Value *StartB = PNB->getIncomingValue(0); + Value *IndexB = PNB->getIncomingValue(1); + if (InnerLoop->contains(PNB->getIncomingBlock(0))) + std::swap(StartB, IndexB); + + // Match the GEPs. + if (!match(IndexA, m_GEP(m_Specific(PNA), m_One())) || + !match(IndexB, m_GEP(m_Specific(PNB), m_One()))) + return false; + + // Check the GEPs result type matches `CharTy'. + GetElementPtrInst *GEPA = cast(IndexA); + GetElementPtrInst *GEPB = cast(IndexB); + if (GEPA->getResultElementType() != CharTy || + GEPB->getResultElementType() != CharTy) + return false; + + // InnerBB should increment the address of the needle pointer. + // + // InnerBB: + // %17 = getelementptr inbounds i8, ptr %20, i64 1 + // %18 = icmp eq ptr %17, %10 + // br i1 %18, label %OuterBB, label %MatchBB + BasicBlock *OuterBB; + Value *EndB; + if (!match(InnerBB->getTerminator(), + m_Br(m_ICmp(MatchPred, m_Specific(GEPB), m_Value(EndB)), + m_BasicBlock(OuterBB), m_Specific(MatchBB))) || + MatchPred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(OuterBB)) + return false; + + // OuterBB should increment the address of the search element pointer. 
+ // + // OuterBB: + // %24 = getelementptr inbounds i8, ptr %14, i64 1 + // %25 = icmp eq ptr %24, %6 + // br i1 %25, label %ExitFail, label %Header + BasicBlock *ExitFail; + Value *EndA; + if (!match(OuterBB->getTerminator(), + m_Br(m_ICmp(MatchPred, m_Specific(GEPA), m_Value(EndA)), + m_BasicBlock(ExitFail), m_Specific(Header))) || + MatchPred != ICmpInst::Predicate::ICMP_EQ) + return false; + + LLVM_DEBUG(dbgs() << "Found idiom in loop: \n" << *CurLoop << "\n\n"); + + transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, StartA, EndA, + StartB, EndB); + return true; +} + +Value *LoopIdiomVectorize::expandFindFirstByte( + IRBuilder<> &Builder, DomTreeUpdater &DTU, unsigned VF, Type *CharTy, + BasicBlock *ExitSucc, BasicBlock *ExitFail, Value *SearchStart, + Value *SearchEnd, Value *NeedleStart, Value *NeedleEnd) { + // Set up some types and constants that we intend to reuse. + auto *PtrTy = Builder.getPtrTy(); + auto *I64Ty = Builder.getInt64Ty(); + auto *PredVTy = ScalableVectorType::get(Builder.getInt1Ty(), VF); + auto *CharVTy = ScalableVectorType::get(CharTy, VF); + auto *ConstVF = ConstantInt::get(I64Ty, VF); + + // Other common arguments. + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + LLVMContext &Ctx = Preheader->getContext(); + Value *Passthru = ConstantInt::getNullValue(CharVTy); + + // Split block in the original loop preheader. + // SPH is the new preheader to the old scalar loop. + BasicBlock *SPH = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, + nullptr, "scalar_ph"); + + // Create the blocks that we're going to use. + // + // We will have the following loops: + // (O) Outer loop where we iterate over the elements of the search array. + // (I) Inner loop where we iterate over the elements of the needle array. + // + // Overall, the blocks do the following: + // (1) Load the search array. Go to (2). + // (2) (a) Load the needle array. + // (b) Splat the first element to the inactive lanes. 
+ // (c) Check if any elements match. If so go to (3), otherwise go to (4). + // (3) Compute the index of the first match and exit. + // (4) Check if we've reached the end of the needle array. If not loop back to + // (2), otherwise go to (5). + // (5) Check if we've reached the end of the search array. If not loop back to + // (1), otherwise exit. + // Block (3) is not part of any loop. Blocks (1,5) and (2,4) belong to the + // outer and inner loops, respectively. + BasicBlock *BB1 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); + BasicBlock *BB2 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); + BasicBlock *BB3 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); + BasicBlock *BB4 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); + BasicBlock *BB5 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); + + // Update LoopInfo with the new loops. + auto OuterLoop = LI->AllocateLoop(); + auto InnerLoop = LI->AllocateLoop(); + + if (auto ParentLoop = CurLoop->getParentLoop()) { + ParentLoop->addChildLoop(OuterLoop); + ParentLoop->addBasicBlockToLoop(BB3, *LI); + } else { + LI->addTopLevelLoop(OuterLoop); + } + + // Add the inner loop to the outer. + OuterLoop->addChildLoop(InnerLoop); + + // Add the new basic blocks to the corresponding loops. + OuterLoop->addBasicBlockToLoop(BB1, *LI); + OuterLoop->addBasicBlockToLoop(BB5, *LI); + InnerLoop->addBasicBlockToLoop(BB2, *LI); + InnerLoop->addBasicBlockToLoop(BB4, *LI); + + // Set a reference to the old scalar loop and create a predicate of VF + // elements. + Builder.SetInsertPoint(Preheader->getTerminator()); + Value *Pred16 = + Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, + {ConstantInt::get(I64Ty, 0), ConstVF}); + Builder.CreateCondBr(Builder.getFalse(), SPH, BB1); + Preheader->getTerminator()->eraseFromParent(); + DTU.applyUpdates({{DominatorTree::Insert, Preheader, BB1}}); + + // (1) Load the search array and branch to the inner loop. 
+ Builder.SetInsertPoint(BB1); + PHINode *Search = Builder.CreatePHI(PtrTy, 2, "psearch"); + Value *PredSearch = + Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, + {Builder.CreatePointerCast(Search, I64Ty), + Builder.CreatePointerCast(SearchEnd, I64Ty)}); + PredSearch = Builder.CreateAnd(Pred16, PredSearch); + Value *LoadSearch = + Builder.CreateMaskedLoad(CharVTy, Search, Align(1), PredSearch, Passthru); + Builder.CreateBr(BB2); + DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2}}); + + // (2) Inner loop. + Builder.SetInsertPoint(BB2); + PHINode *Needle = Builder.CreatePHI(PtrTy, 2, "pneedle"); + + // (2.a) Load the needle array. + Value *PredNeedle = + Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, + {Builder.CreatePointerCast(Needle, I64Ty), + Builder.CreatePointerCast(NeedleEnd, I64Ty)}); + PredNeedle = Builder.CreateAnd(Pred16, PredNeedle); + Value *LoadNeedle = + Builder.CreateMaskedLoad(CharVTy, Needle, Align(1), PredNeedle, Passthru); + + // (2.b) Splat the first element to the inactive lanes. + Value *Needle0 = Builder.CreateExtractElement(LoadNeedle, uint64_t(0)); + Value *Needle0Splat = + Builder.CreateVectorSplat(ElementCount::getScalable(VF), Needle0); + LoadNeedle = Builder.CreateSelect(PredNeedle, LoadNeedle, Needle0Splat); + LoadNeedle = Builder.CreateExtractVector( + FixedVectorType::get(CharTy, VF), LoadNeedle, ConstantInt::get(I64Ty, 0)); + + // (2.c) Test if there's a match. + Value *MatchPred = Builder.CreateIntrinsic( + Intrinsic::experimental_vector_match, {CharVTy, LoadNeedle->getType()}, + {LoadSearch, LoadNeedle, PredSearch}); + Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred); + Builder.CreateCondBr(IfAnyMatch, BB3, BB4); + DTU.applyUpdates( + {{DominatorTree::Insert, BB2, BB3}, {DominatorTree::Insert, BB2, BB4}}); + + // (3) We found a match. Compute the index of its location and exit. 
+ Builder.SetInsertPoint(BB3); + Value *MatchCnt = Builder.CreateIntrinsic( + Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()}, + {MatchPred, /*ZeroIsPoison=*/Builder.getInt1(true)}); + Value *MatchVal = Builder.CreateGEP(CharTy, Search, MatchCnt); + Builder.CreateBr(ExitSucc); + DTU.applyUpdates({{DominatorTree::Insert, BB3, ExitSucc}}); + + // (4) Check if we've reached the end of the needle array. + Builder.SetInsertPoint(BB4); + Value *NextNeedle = Builder.CreateGEP(CharTy, Needle, ConstVF); + Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedle, NeedleEnd), BB2, BB5); + DTU.applyUpdates( + {{DominatorTree::Insert, BB4, BB2}, {DominatorTree::Insert, BB4, BB5}}); + + // (5) Check if we've reached the end of the search array. + Builder.SetInsertPoint(BB5); + Value *NextSearch = Builder.CreateGEP(CharTy, Search, ConstVF); + Builder.CreateCondBr(Builder.CreateICmpULT(NextSearch, SearchEnd), BB1, + ExitFail); + DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1}, + {DominatorTree::Insert, BB5, ExitFail}}); + + // Set up the PHI's. + Search->addIncoming(SearchStart, Preheader); + Search->addIncoming(NextSearch, BB5); + Needle->addIncoming(NeedleStart, BB1); + Needle->addIncoming(NextNeedle, BB4); + + if (VerifyLoops) { + OuterLoop->verifyLoop(); + InnerLoop->verifyLoop(); + if (!OuterLoop->isRecursivelyLCSSAForm(*DT, *LI)) + report_fatal_error("Loops must remain in LCSSA form!"); + } + + return MatchVal; +} + +void LoopIdiomVectorize::transformFindFirstByte( + PHINode *IndPhi, unsigned VF, Type *CharTy, BasicBlock *ExitSucc, + BasicBlock *ExitFail, Value *SearchStart, Value *SearchEnd, + Value *NeedleStart, Value *NeedleEnd) { + // Insert the find first byte code at the end of the preheader block. 
+ BasicBlock *Preheader = CurLoop->getLoopPreheader(); + BranchInst *PHBranch = cast(Preheader->getTerminator()); + IRBuilder<> Builder(PHBranch); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + Builder.SetCurrentDebugLocation(PHBranch->getDebugLoc()); + + Value *MatchVal = + expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail, + SearchStart, SearchEnd, NeedleStart, NeedleEnd); + + // Add new incoming values with the result of the transformation to PHINodes + // of ExitSucc that use IndPhi. + for (auto *U : llvm::make_early_inc_range(IndPhi->users())) + if (auto *PN = dyn_cast(U); PN && PN->getParent() == ExitSucc) + PN->addIncoming(MatchVal, cast(MatchVal)->getParent()); + + if (VerifyLoops && CurLoop->getParentLoop()) { + CurLoop->getParentLoop()->verifyLoop(); + if (!CurLoop->getParentLoop()->isRecursivelyLCSSAForm(*DT, *LI)) + report_fatal_error("Loops must remain in LCSSA form!"); + } +} diff --git a/llvm/test/CodeGen/AArch64/find-first-byte.ll b/llvm/test/CodeGen/AArch64/find-first-byte.ll new file mode 100644 index 0000000000000..a324896413d78 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/find-first-byte.ll @@ -0,0 +1,429 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=aarch64 -mattr=+sve -passes='loop(loop-idiom-vectorize)' -verify-loop-info -verify-dom-info -S < %s | FileCheck %s + +; Base case based on `libcxx/include/__algorithm/find_first_of.h': +; char* find_first_of(char *first, char *last, char *s_first, char *s_last) { +; for (; first != last; ++first) +; for (char *it = s_first; it != s_last; ++it) +; if (*first == *it) +; return first; +; return last; +; } +define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 { +; CHECK-LABEL: define ptr @find_first_of_i8( +; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] +; 
CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[BB48:.*]], label %[[DOTPREHEADER:.*]] +; CHECK: [[_PREHEADER:.*:]] +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16) +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]] +; CHECK: [[BB9]]: +; CHECK-NEXT: [[SEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[SEARCH]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP10]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = and [[TMP8]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[SEARCH]], i32 1, [[TMP13]], zeroinitializer) +; CHECK-NEXT: br label %[[BB15:.*]] +; CHECK: [[BB15]]: +; CHECK-NEXT: [[NEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ] +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[NEEDLE]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP16]], i64 [[TMP17]]) +; CHECK-NEXT: [[TMP19:%.*]] = and [[TMP8]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[NEEDLE]], i32 1, [[TMP19]], zeroinitializer) +; CHECK-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP21]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = select [[TMP19]], [[TMP20]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP23:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8( [[TMP22]], i64 0) +; CHECK-NEXT: [[TMP24:%.*]] = call @llvm.experimental.vector.match.nxv16i8.v16i8( [[TMP14]], <16 x i8> [[TMP23]], [[TMP13]]) +; 
CHECK-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP24]]) +; CHECK-NEXT: br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]] +; CHECK: [[BB26]]: +; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP24]], i1 true) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SEARCH]], i64 [[TMP27]] +; CHECK-NEXT: br label %[[DOTLOOPEXIT:.*]] +; CHECK: [[TMP29]]: +; CHECK-NEXT: [[TMP30]] = getelementptr i8, ptr [[NEEDLE]], i64 16 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]] +; CHECK: [[TMP32]]: +; CHECK-NEXT: [[TMP33]] = getelementptr i8, ptr [[SEARCH]], i64 16 +; CHECK-NEXT: [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP34]], label %[[BB9]], label %[[DOTLOOPEXIT1:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: br label %[[BB35:.*]] +; CHECK: [[BB35]]: +; CHECK-NEXT: [[TMP36:%.*]] = phi ptr [ [[TMP46:%.*]], %[[TMP45:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1 +; CHECK-NEXT: br label %[[BB41:.*]] +; CHECK: [[BB38:.*]]: +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[TMP42:%.*]], i64 1 +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq ptr [[TMP39]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP40]], label %[[TMP45]], label %[[BB41]] +; CHECK: [[BB41]]: +; CHECK-NEXT: [[TMP42]] = phi ptr [ [[TMP2]], %[[BB35]] ], [ [[TMP39]], %[[BB38]] ] +; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i8 [[TMP37]], [[TMP43]] +; CHECK-NEXT: br i1 [[TMP44]], label %[[DOTLOOPEXIT]], label %[[BB38]] +; CHECK: [[TMP45]]: +; CHECK-NEXT: [[TMP46]] = getelementptr inbounds i8, ptr [[TMP36]], i64 1 +; CHECK-NEXT: [[TMP47:%.*]] = icmp eq ptr [[TMP46]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP47]], label %[[DOTLOOPEXIT1]], label %[[BB35]] +; CHECK: [[_LOOPEXIT:.*:]] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP36]], %[[BB41]] ], [ 
[[TMP28]], %[[BB26]] ] +; CHECK-NEXT: br label %[[BB48]] +; CHECK: [[_LOOPEXIT1:.*:]] +; CHECK-NEXT: br label %[[BB48]] +; CHECK: [[BB48]]: +; CHECK-NEXT: [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] +; CHECK-NEXT: ret ptr [[TMP49]] +; + %5 = icmp eq ptr %0, %1 + %6 = icmp eq ptr %2, %3 + %7 = or i1 %5, %6 + br i1 %7, label %21, label %8 + +8: + %9 = phi ptr [ %19, %18 ], [ %0, %4 ] + %10 = load i8, ptr %9, align 1 + br label %14 + +11: + %12 = getelementptr inbounds i8, ptr %15, i64 1 + %13 = icmp eq ptr %12, %3 + br i1 %13, label %18, label %14 + +14: + %15 = phi ptr [ %2, %8 ], [ %12, %11 ] + %16 = load i8, ptr %15, align 1 + %17 = icmp eq i8 %10, %16 + br i1 %17, label %21, label %11 + +18: + %19 = getelementptr inbounds i8, ptr %9, i64 1 + %20 = icmp eq ptr %19, %1 + br i1 %20, label %21, label %8 + +21: + %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ] + ret ptr %22 +} + +; Same as @find_first_of_i8 but with i16. +; This is accepted and generates a similar loop. 
+define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 { +; CHECK-LABEL: define ptr @find_first_of_i16( +; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[BB48:.*]], label %[[DOTPREHEADER:.*]] +; CHECK: [[_PREHEADER:.*:]] +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8) +; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]] +; CHECK: [[BB9]]: +; CHECK-NEXT: [[SEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[SEARCH]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP10]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = and [[TMP8]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[SEARCH]], i32 1, [[TMP13]], zeroinitializer) +; CHECK-NEXT: br label %[[BB15:.*]] +; CHECK: [[BB15]]: +; CHECK-NEXT: [[NEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ] +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[NEEDLE]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64 +; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP16]], i64 [[TMP17]]) +; CHECK-NEXT: [[TMP19:%.*]] = and [[TMP8]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[NEEDLE]], i32 1, [[TMP19]], zeroinitializer) +; CHECK-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i16 [[TMP21]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = 
select [[TMP19]], [[TMP20]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP23:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16( [[TMP22]], i64 0) +; CHECK-NEXT: [[TMP24:%.*]] = call @llvm.experimental.vector.match.nxv8i16.v8i16( [[TMP14]], <8 x i16> [[TMP23]], [[TMP13]]) +; CHECK-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1( [[TMP24]]) +; CHECK-NEXT: br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]] +; CHECK: [[BB26]]: +; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( [[TMP24]], i1 true) +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[SEARCH]], i64 [[TMP27]] +; CHECK-NEXT: br label %[[DOTLOOPEXIT:.*]] +; CHECK: [[TMP29]]: +; CHECK-NEXT: [[TMP30]] = getelementptr i16, ptr [[NEEDLE]], i64 8 +; CHECK-NEXT: [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]] +; CHECK: [[TMP32]]: +; CHECK-NEXT: [[TMP33]] = getelementptr i16, ptr [[SEARCH]], i64 8 +; CHECK-NEXT: [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP34]], label %[[BB9]], label %[[DOTLOOPEXIT1:.*]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: br label %[[BB35:.*]] +; CHECK: [[BB35]]: +; CHECK-NEXT: [[TMP36:%.*]] = phi ptr [ [[TMP46:%.*]], %[[TMP45:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP36]], align 1 +; CHECK-NEXT: br label %[[BB41:.*]] +; CHECK: [[BB38:.*]]: +; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i16, ptr [[TMP42:%.*]], i64 1 +; CHECK-NEXT: [[TMP40:%.*]] = icmp eq ptr [[TMP39]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP40]], label %[[TMP45]], label %[[BB41]] +; CHECK: [[BB41]]: +; CHECK-NEXT: [[TMP42]] = phi ptr [ [[TMP2]], %[[BB35]] ], [ [[TMP39]], %[[BB38]] ] +; CHECK-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP42]], align 1 +; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i16 [[TMP37]], [[TMP43]] +; CHECK-NEXT: br i1 [[TMP44]], label %[[DOTLOOPEXIT]], label %[[BB38]] +; CHECK: [[TMP45]]: +; CHECK-NEXT: [[TMP46]] = getelementptr 
inbounds i16, ptr [[TMP36]], i64 1 +; CHECK-NEXT: [[TMP47:%.*]] = icmp eq ptr [[TMP46]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP47]], label %[[DOTLOOPEXIT1]], label %[[BB35]] +; CHECK: [[_LOOPEXIT:.*:]] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP36]], %[[BB41]] ], [ [[TMP28]], %[[BB26]] ] +; CHECK-NEXT: br label %[[BB48]] +; CHECK: [[_LOOPEXIT1:.*:]] +; CHECK-NEXT: br label %[[BB48]] +; CHECK: [[BB48]]: +; CHECK-NEXT: [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] +; CHECK-NEXT: ret ptr [[TMP49]] +; + %5 = icmp eq ptr %0, %1 + %6 = icmp eq ptr %2, %3 + %7 = or i1 %5, %6 + br i1 %7, label %21, label %8 + +8: + %9 = phi ptr [ %19, %18 ], [ %0, %4 ] + %10 = load i16, ptr %9, align 1 + br label %14 + +11: + %12 = getelementptr inbounds i16, ptr %15, i64 1 + %13 = icmp eq ptr %12, %3 + br i1 %13, label %18, label %14 + +14: + %15 = phi ptr [ %2, %8 ], [ %12, %11 ] + %16 = load i16, ptr %15, align 1 + %17 = icmp eq i16 %10, %16 + br i1 %17, label %21, label %11 + +18: + %19 = getelementptr inbounds i16, ptr %9, i64 1 + %20 = icmp eq ptr %19, %1 + br i1 %20, label %21, label %8 + +21: + %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ] + ret ptr %22 +} + +; Same as @find_first_of_i8 but with `ne' comparison. +; This is rejected for now, but should eventually be supported. 
+define ptr @find_first_not_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 { +; CHECK-LABEL: define ptr @find_first_not_of_i8( +; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]] +; CHECK: [[_PREHEADER:.*:]] +; CHECK-NEXT: br label %[[BB8:.*]] +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ] +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 +; CHECK-NEXT: br label %[[BB14:.*]] +; CHECK: [[BB11:.*]]: +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]] +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i8 [[TMP10]], [[TMP16]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]] +; CHECK: [[TMP18]]: +; CHECK-NEXT: [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]] +; CHECK: [[_LOOPEXIT:.*:]] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ] +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[_LOOPEXIT1:.*:]] +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[BB21]]: +; CHECK-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] +; CHECK-NEXT: ret ptr [[TMP22]] +; + %5 = icmp eq ptr %0, %1 + %6 = icmp eq ptr %2, %3 + %7 = or i1 %5, %6 + br i1 
%7, label %21, label %8 + +8: + %9 = phi ptr [ %19, %18 ], [ %0, %4 ] + %10 = load i8, ptr %9, align 1 + br label %14 + +11: + %12 = getelementptr inbounds i8, ptr %15, i64 1 + %13 = icmp eq ptr %12, %3 + br i1 %13, label %18, label %14 + +14: + %15 = phi ptr [ %2, %8 ], [ %12, %11 ] + %16 = load i8, ptr %15, align 1 + %17 = icmp ne i8 %10, %16 + br i1 %17, label %21, label %11 + +18: + %19 = getelementptr inbounds i8, ptr %9, i64 1 + %20 = icmp eq ptr %19, %1 + br i1 %20, label %21, label %8 + +21: + %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ] + ret ptr %22 +} + +; This is the same as @find_first_of_i8 but without SVE2, which we require to +; perform the conversion. +define ptr @find_first_of_i8_nosve2(ptr %0, ptr %1, ptr %2, ptr %3) { +; CHECK-LABEL: define ptr @find_first_of_i8_nosve2( +; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]] +; CHECK: [[_PREHEADER:.*:]] +; CHECK-NEXT: br label %[[BB8:.*]] +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ] +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 +; CHECK-NEXT: br label %[[BB14:.*]] +; CHECK: [[BB11:.*]]: +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]] +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i8 [[TMP10]], [[TMP16]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]] +; 
CHECK: [[TMP18]]: +; CHECK-NEXT: [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]] +; CHECK: [[_LOOPEXIT:.*:]] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ] +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[_LOOPEXIT1:.*:]] +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[BB21]]: +; CHECK-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] +; CHECK-NEXT: ret ptr [[TMP22]] +; + %5 = icmp eq ptr %0, %1 + %6 = icmp eq ptr %2, %3 + %7 = or i1 %5, %6 + br i1 %7, label %21, label %8 + +8: + %9 = phi ptr [ %19, %18 ], [ %0, %4 ] + %10 = load i8, ptr %9, align 1 + br label %14 + +11: + %12 = getelementptr inbounds i8, ptr %15, i64 1 + %13 = icmp eq ptr %12, %3 + br i1 %13, label %18, label %14 + +14: + %15 = phi ptr [ %2, %8 ], [ %12, %11 ] + %16 = load i8, ptr %15, align 1 + %17 = icmp eq i8 %10, %16 + br i1 %17, label %21, label %11 + +18: + %19 = getelementptr inbounds i8, ptr %9, i64 1 + %20 = icmp eq ptr %19, %1 + br i1 %20, label %21, label %8 + +21: + %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ] + ret ptr %22 +} + +; Same as @find_first_of_i8 but here we use the inner PHI outside the loop nest. +; This isn't supported. 
+define ptr @find_first_of_i8_outside_use(ptr %0, ptr %1, ptr %2, ptr %3) #0 { +; CHECK-LABEL: define ptr @find_first_of_i8_outside_use( +; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]] +; CHECK: [[_PREHEADER:.*:]] +; CHECK-NEXT: br label %[[BB8:.*]] +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ] +; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 +; CHECK-NEXT: br label %[[BB14:.*]] +; CHECK: [[BB11:.*]]: +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]] +; CHECK: [[BB14]]: +; CHECK-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ] +; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1 +; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i8 [[TMP10]], [[TMP16]] +; CHECK-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]] +; CHECK: [[TMP18]]: +; CHECK-NEXT: [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]] +; CHECK: [[_LOOPEXIT:.*:]] +; CHECK-NEXT: [[DOTLCSSA3:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP15]], %[[BB14]] ] +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[_LOOPEXIT1:.*:]] +; CHECK-NEXT: br label %[[BB21]] +; CHECK: [[BB21]]: +; CHECK-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA3]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] +; CHECK-NEXT: [[TMP23:%.*]] = 
phi ptr [ [[TMP3]], [[TMP4]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP3]], %[[DOTLOOPEXIT1]] ] +; CHECK-NEXT: ret ptr [[TMP23]] +; + %5 = icmp eq ptr %0, %1 + %6 = icmp eq ptr %2, %3 + %7 = or i1 %5, %6 + br i1 %7, label %21, label %8 + +8: + %9 = phi ptr [ %19, %18 ], [ %0, %4 ] + %10 = load i8, ptr %9, align 1 + br label %14 + +11: + %12 = getelementptr inbounds i8, ptr %15, i64 1 + %13 = icmp eq ptr %12, %3 + br i1 %13, label %18, label %14 + +14: + %15 = phi ptr [ %2, %8 ], [ %12, %11 ] + %16 = load i8, ptr %15, align 1 + %17 = icmp ne i8 %10, %16 + br i1 %17, label %21, label %11 + +18: + %19 = getelementptr inbounds i8, ptr %9, i64 1 + %20 = icmp eq ptr %19, %1 + br i1 %20, label %21, label %8 + +21: + %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ] + %23 = phi ptr [ %3, %4 ], [ %15, %14 ], [ %3, %18 ] + ret ptr %23 +} + +attributes #0 = { "target-features"="+sve2" } From 2241058094a49c63a5912f5d72a0b69a9236fe02 Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: Wed, 11 Dec 2024 02:37:58 -0800 Subject: [PATCH 2/7] Add RUN line with -disable(...) 
and refactor tests --- llvm/test/CodeGen/AArch64/find-first-byte.ll | 219 +++++++++---------- 1 file changed, 103 insertions(+), 116 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/find-first-byte.ll b/llvm/test/CodeGen/AArch64/find-first-byte.ll index a324896413d78..202ee0982d95a 100644 --- a/llvm/test/CodeGen/AArch64/find-first-byte.ll +++ b/llvm/test/CodeGen/AArch64/find-first-byte.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -mtriple=aarch64 -mattr=+sve -passes='loop(loop-idiom-vectorize)' -verify-loop-info -verify-dom-info -S < %s | FileCheck %s +; RUN: opt -mtriple=aarch64 -mattr=+sve -passes='loop(loop-idiom-vectorize)' -disable-loop-idiom-vectorize-find-first-byte -S < %s | FileCheck -check-prefix=DISABLE %s ; Base case based on `libcxx/include/__algorithm/find_first_of.h': ; char* find_first_of(char *first, char *last, char *s_first, char *s_last) { @@ -20,20 +21,20 @@ define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16) ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]] ; CHECK: [[BB9]]: -; CHECK-NEXT: [[SEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ] -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[SEARCH]] to i64 +; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP10]], i64 [[TMP11]]) ; CHECK-NEXT: [[TMP13:%.*]] = and <vscale x 16 x i1> [[TMP8]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[SEARCH]], i32 1, <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, <vscale x 16 x i1> [[TMP13]], <vscale x 16 x i8>
zeroinitializer) ; CHECK-NEXT: br label %[[BB15:.*]] ; CHECK: [[BB15]]: -; CHECK-NEXT: [[NEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ] -; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[NEEDLE]] to i64 +; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ] +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP16]], i64 [[TMP17]]) ; CHECK-NEXT: [[TMP19:%.*]] = and <vscale x 16 x i1> [[TMP8]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[NEEDLE]], i32 1, <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[TMP20:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, <vscale x 16 x i1> [[TMP19]], <vscale x 16 x i8> zeroinitializer) ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <vscale x 16 x i8> [[TMP20]], i64 0 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP21]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer @@ -44,14 +45,14 @@ define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ; CHECK-NEXT: br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]] ; CHECK: [[BB26]]: ; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP24]], i1 true) -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[SEARCH]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[PSEARCH]], i64 [[TMP27]] ; CHECK-NEXT: br label %[[DOTLOOPEXIT:.*]] ; CHECK: [[TMP29]]: -; CHECK-NEXT: [[TMP30]] = getelementptr i8, ptr [[NEEDLE]], i64 16 +; CHECK-NEXT: [[TMP30]] = getelementptr i8, ptr [[PNEEDLE]], i64 16 ; CHECK-NEXT: [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]] ; CHECK-NEXT: br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]] ; CHECK: [[TMP32]]: -; CHECK-NEXT: [[TMP33]] = getelementptr i8, ptr [[SEARCH]], i64 16 +; CHECK-NEXT: [[TMP33]] = getelementptr i8, ptr [[PSEARCH]], i64 16 ;
CHECK-NEXT: [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP34]], label %[[BB9]], label %[[DOTLOOPEXIT1:.*]] ; CHECK: [[SCALAR_PH]]: @@ -81,6 +82,40 @@ define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ; CHECK: [[BB48]]: ; CHECK-NEXT: [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] ; CHECK-NEXT: ret ptr [[TMP49]] +; +; DISABLE-LABEL: define ptr @find_first_of_i8( +; DISABLE-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] { +; DISABLE-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] +; DISABLE-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] +; DISABLE-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; DISABLE-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]] +; DISABLE: [[_PREHEADER:.*:]] +; DISABLE-NEXT: br label %[[BB8:.*]] +; DISABLE: [[BB8]]: +; DISABLE-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ] +; DISABLE-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 +; DISABLE-NEXT: br label %[[BB14:.*]] +; DISABLE: [[BB11:.*]]: +; DISABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1 +; DISABLE-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]] +; DISABLE-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]] +; DISABLE: [[BB14]]: +; DISABLE-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ] +; DISABLE-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1 +; DISABLE-NEXT: [[TMP17:%.*]] = icmp eq i8 [[TMP10]], [[TMP16]] +; DISABLE-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]] +; DISABLE: [[TMP18]]: +; DISABLE-NEXT: [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1 +; DISABLE-NEXT: [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]] +; DISABLE-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]] +; DISABLE: [[_LOOPEXIT:.*:]] +; DISABLE-NEXT: 
[[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ] +; DISABLE-NEXT: br label %[[BB21]] +; DISABLE: [[_LOOPEXIT1:.*:]] +; DISABLE-NEXT: br label %[[BB21]] +; DISABLE: [[BB21]]: +; DISABLE-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] +; DISABLE-NEXT: ret ptr [[TMP22]] ; %5 = icmp eq ptr %0, %1 %6 = icmp eq ptr %2, %3 @@ -116,6 +151,7 @@ define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ; Same as @find_first_of_i8 but with i16. ; This is accepted and generates a similar loop. define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 { +; ; CHECK-LABEL: define ptr @find_first_of_i16( ; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] @@ -126,20 +162,20 @@ define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8) ; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]] ; CHECK: [[BB9]]: -; CHECK-NEXT: [[SEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ] -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[SEARCH]] to i64 +; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ] +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP10]], i64 [[TMP11]]) ; CHECK-NEXT: [[TMP13:%.*]] = and <vscale x 8 x i1> [[TMP8]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[SEARCH]], i32 1, <vscale x 8 x i1> [[TMP13]], <vscale x 8 x i16> zeroinitializer) +; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PSEARCH]], i32 1, <vscale x 8 x i1> [[TMP13]], <vscale x 8 x i16> zeroinitializer) ; CHECK-NEXT: br label %[[BB15:.*]] ; CHECK: [[BB15]]: -; CHECK-NEXT: [[NEEDLE:%.*]] = phi ptr [
[[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ] -; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[NEEDLE]] to i64 +; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ] +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP16]], i64 [[TMP17]]) ; CHECK-NEXT: [[TMP19:%.*]] = and <vscale x 8 x i1> [[TMP8]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[NEEDLE]], i32 1, <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i16> zeroinitializer) +; CHECK-NEXT: [[TMP20:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[PNEEDLE]], i32 1, <vscale x 8 x i1> [[TMP19]], <vscale x 8 x i16> zeroinitializer) ; CHECK-NEXT: [[TMP21:%.*]] = extractelement <vscale x 8 x i16> [[TMP20]], i64 0 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 8 x i16> poison, i16 [[TMP21]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 8 x i16> [[DOTSPLATINSERT]], <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer @@ -150,14 +186,14 @@ define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ; CHECK-NEXT: br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]] ; CHECK: [[BB26]]: ; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[TMP24]], i1 true) -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[SEARCH]], i64 [[TMP27]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[PSEARCH]], i64 [[TMP27]] ; CHECK-NEXT: br label %[[DOTLOOPEXIT:.*]] ; CHECK: [[TMP29]]: -; CHECK-NEXT: [[TMP30]] = getelementptr i16, ptr [[NEEDLE]], i64 8 +; CHECK-NEXT: [[TMP30]] = getelementptr i16, ptr [[PNEEDLE]], i64 8 ; CHECK-NEXT: [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]] ; CHECK-NEXT: br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]] ; CHECK: [[TMP32]]: -; CHECK-NEXT: [[TMP33]] = getelementptr i16, ptr [[SEARCH]], i64 8 +; CHECK-NEXT: [[TMP33]] = getelementptr i16, ptr [[PSEARCH]], i64 8 ; CHECK-NEXT: [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]] ; CHECK-NEXT: br i1 [[TMP34]], label %[[BB9]], label
%[[DOTLOOPEXIT1:.*]] ; CHECK: [[SCALAR_PH]]: @@ -187,6 +223,40 @@ define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ; CHECK: [[BB48]]: ; CHECK-NEXT: [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] ; CHECK-NEXT: ret ptr [[TMP49]] +; +; DISABLE-LABEL: define ptr @find_first_of_i16( +; DISABLE-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { +; DISABLE-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] +; DISABLE-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] +; DISABLE-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] +; DISABLE-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]] +; DISABLE: [[_PREHEADER:.*:]] +; DISABLE-NEXT: br label %[[BB8:.*]] +; DISABLE: [[BB8]]: +; DISABLE-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ] +; DISABLE-NEXT: [[TMP10:%.*]] = load i16, ptr [[TMP9]], align 1 +; DISABLE-NEXT: br label %[[BB14:.*]] +; DISABLE: [[BB11:.*]]: +; DISABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP15:%.*]], i64 1 +; DISABLE-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]] +; DISABLE-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]] +; DISABLE: [[BB14]]: +; DISABLE-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ] +; DISABLE-NEXT: [[TMP16:%.*]] = load i16, ptr [[TMP15]], align 1 +; DISABLE-NEXT: [[TMP17:%.*]] = icmp eq i16 [[TMP10]], [[TMP16]] +; DISABLE-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]] +; DISABLE: [[TMP18]]: +; DISABLE-NEXT: [[TMP19]] = getelementptr inbounds i16, ptr [[TMP9]], i64 1 +; DISABLE-NEXT: [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]] +; DISABLE-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]] +; DISABLE: [[_LOOPEXIT:.*:]] +; DISABLE-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ] +; DISABLE-NEXT: br label %[[BB21]] +; DISABLE: 
[[_LOOPEXIT1:.*:]] +; DISABLE-NEXT: br label %[[BB21]] +; DISABLE: [[BB21]]: +; DISABLE-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] +; DISABLE-NEXT: ret ptr [[TMP22]] ; %5 = icmp eq ptr %0, %1 %6 = icmp eq ptr %2, %3 @@ -219,42 +289,17 @@ define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ret ptr %22 } +; From here on we only test for the presence/absence of the intrinsic. +; UTC_ARGS: --disable + ; Same as @find_first_of_i8 but with `ne' comparison. ; This is rejected for now, but should eventually be supported. define ptr @find_first_not_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ; CHECK-LABEL: define ptr @find_first_not_of_i8( -; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]] -; CHECK: [[_PREHEADER:.*:]] -; CHECK-NEXT: br label %[[BB8:.*]] -; CHECK: [[BB8]]: -; CHECK-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ] -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 -; CHECK-NEXT: br label %[[BB14:.*]] -; CHECK: [[BB11:.*]]: -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]] -; CHECK-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]] -; CHECK: [[BB14]]: -; CHECK-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ] -; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i8 [[TMP10]], [[TMP16]] -; CHECK-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]] -; CHECK: [[TMP18]]: -; CHECK-NEXT: [[TMP19]] = getelementptr inbounds i8, ptr 
[[TMP9]], i64 1 -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]] -; CHECK: [[_LOOPEXIT:.*:]] -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ] -; CHECK-NEXT: br label %[[BB21]] -; CHECK: [[_LOOPEXIT1:.*:]] -; CHECK-NEXT: br label %[[BB21]] -; CHECK: [[BB21]]: -; CHECK-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] -; CHECK-NEXT: ret ptr [[TMP22]] +; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} +; +; DISABLE-LABEL: define ptr @find_first_not_of_i8( +; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} ; %5 = icmp eq ptr %0, %1 %6 = icmp eq ptr %2, %3 @@ -291,38 +336,10 @@ define ptr @find_first_not_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ; perform the conversion. define ptr @find_first_of_i8_nosve2(ptr %0, ptr %1, ptr %2, ptr %3) { ; CHECK-LABEL: define ptr @find_first_of_i8_nosve2( -; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]] -; CHECK: [[_PREHEADER:.*:]] -; CHECK-NEXT: br label %[[BB8:.*]] -; CHECK: [[BB8]]: -; CHECK-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ] -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 -; CHECK-NEXT: br label %[[BB14:.*]] -; CHECK: [[BB11:.*]]: -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]] -; CHECK-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]] -; CHECK: [[BB14]]: -; CHECK-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ] -; 
CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i8 [[TMP10]], [[TMP16]] -; CHECK-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]] -; CHECK: [[TMP18]]: -; CHECK-NEXT: [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1 -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]] -; CHECK: [[_LOOPEXIT:.*:]] -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ] -; CHECK-NEXT: br label %[[BB21]] -; CHECK: [[_LOOPEXIT1:.*:]] -; CHECK-NEXT: br label %[[BB21]] -; CHECK: [[BB21]]: -; CHECK-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] -; CHECK-NEXT: ret ptr [[TMP22]] +; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} +; +; DISABLE-LABEL: define ptr @find_first_of_i8_nosve2( +; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} ; %5 = icmp eq ptr %0, %1 %6 = icmp eq ptr %2, %3 @@ -359,40 +376,10 @@ define ptr @find_first_of_i8_nosve2(ptr %0, ptr %1, ptr %2, ptr %3) { ; This isn't supported. 
define ptr @find_first_of_i8_outside_use(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ; CHECK-LABEL: define ptr @find_first_of_i8_outside_use( -; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]] -; CHECK: [[_PREHEADER:.*:]] -; CHECK-NEXT: br label %[[BB8:.*]] -; CHECK: [[BB8]]: -; CHECK-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ] -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 -; CHECK-NEXT: br label %[[BB14:.*]] -; CHECK: [[BB11:.*]]: -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1 -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]] -; CHECK-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]] -; CHECK: [[BB14]]: -; CHECK-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ] -; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1 -; CHECK-NEXT: [[TMP17:%.*]] = icmp ne i8 [[TMP10]], [[TMP16]] -; CHECK-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]] -; CHECK: [[TMP18]]: -; CHECK-NEXT: [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1 -; CHECK-NEXT: [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]] -; CHECK: [[_LOOPEXIT:.*:]] -; CHECK-NEXT: [[DOTLCSSA3:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ] -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP15]], %[[BB14]] ] -; CHECK-NEXT: br label %[[BB21]] -; CHECK: [[_LOOPEXIT1:.*:]] -; CHECK-NEXT: br label %[[BB21]] -; CHECK: [[BB21]]: -; CHECK-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA3]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] -; CHECK-NEXT: [[TMP23:%.*]] = phi 
ptr [ [[TMP3]], [[TMP4]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP3]], %[[DOTLOOPEXIT1]] ] -; CHECK-NEXT: ret ptr [[TMP23]] +; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} +; +; DISABLE-LABEL: define ptr @find_first_of_i8_outside_use( +; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} ; %5 = icmp eq ptr %0, %1 %6 = icmp eq ptr %2, %3 From 042bda352a7f7fcf8b9b8fb943a8de639ce03e69 Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: Wed, 11 Dec 2024 09:19:07 -0800 Subject: [PATCH 3/7] Add page boundary checks and address other comments --- .../Vectorize/LoopIdiomVectorize.cpp | 183 +++++++----- llvm/test/CodeGen/AArch64/find-first-byte.ll | 269 ++++++++++-------- 2 files changed, 261 insertions(+), 191 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp index a874dd9f8f181..7c42cdf056a93 100644 --- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp @@ -992,8 +992,10 @@ void LoopIdiomVectorize::transformByteCompare(GetElementPtrInst *GEPA, bool LoopIdiomVectorize::recognizeFindFirstByte() { // Currently the transformation only works on scalable vector types, although // there is no fundamental reason why it cannot be made to work for fixed - // vectors too. - if (!TTI->supportsScalableVectors() || DisableFindFirstByte) + // vectors. We also need to know the target's minimum page size in order to + // generate runtime memory checks to ensure the vector version won't fault. + if (!TTI->supportsScalableVectors() || !TTI->getMinPageSize().has_value() || + DisableFindFirstByte) return false; // Define some constants we need throughout. 
@@ -1049,30 +1051,33 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() { // %22 = icmp eq i8 %15, %21 // br i1 %22, label %ExitSucc, label %InnerBB BasicBlock *ExitSucc, *InnerBB; - Value *LoadA, *LoadB; - ICmpInst::Predicate MatchPred; + Value *LoadSearch, *LoadNeedle; + CmpPredicate MatchPred; if (!match(MatchBB->getTerminator(), - m_Br(m_ICmp(MatchPred, m_Value(LoadA), m_Value(LoadB)), + m_Br(m_ICmp(MatchPred, m_Value(LoadSearch), m_Value(LoadNeedle)), m_BasicBlock(ExitSucc), m_BasicBlock(InnerBB))) || - MatchPred != ICmpInst::Predicate::ICMP_EQ || - !InnerLoop->contains(InnerBB)) + MatchPred != ICmpInst::ICMP_EQ || !InnerLoop->contains(InnerBB)) return false; // We expect outside uses of `IndPhi' in ExitSucc (and only there). for (User *U : IndPhi->users()) - if (!CurLoop->contains(cast<Instruction>(U))) - if (auto *PN = dyn_cast<PHINode>(U); !PN || PN->getParent() != ExitSucc) + if (!CurLoop->contains(cast<Instruction>(U))) { + auto *PN = dyn_cast<PHINode>(U); + if (!PN || PN->getParent() != ExitSucc) return false; + } // Match the loads and check they are simple. - Value *A, *B; - if (!match(LoadA, m_Load(m_Value(A))) || !cast<LoadInst>(LoadA)->isSimple() || - !match(LoadB, m_Load(m_Value(B))) || !cast<LoadInst>(LoadB)->isSimple()) + Value *Search, *Needle; + if (!match(LoadSearch, m_Load(m_Value(Search))) || + !match(LoadNeedle, m_Load(m_Value(Needle))) || + !cast<LoadInst>(LoadSearch)->isSimple() || + !cast<LoadInst>(LoadNeedle)->isSimple()) return false; // Check we are loading valid characters. - Type *CharTy = LoadA->getType(); - if (!CharTy->isIntegerTy() || LoadB->getType() != CharTy) + Type *CharTy = LoadSearch->getType(); + if (!CharTy->isIntegerTy() || LoadNeedle->getType() != CharTy) return false; // Pick the vectorisation factor based on CharTy, work out the cost of the @@ -1088,40 +1093,40 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() { return false; // The loads come from two PHIs, each with two incoming values.
- PHINode *PNA = dyn_cast<PHINode>(A); - PHINode *PNB = dyn_cast<PHINode>(B); - if (!PNA || PNA->getNumIncomingValues() != 2 || !PNB || - PNB->getNumIncomingValues() != 2) + PHINode *PSearch = dyn_cast<PHINode>(Search); + PHINode *PNeedle = dyn_cast<PHINode>(Needle); + if (!PSearch || PSearch->getNumIncomingValues() != 2 || !PNeedle || + PNeedle->getNumIncomingValues() != 2) return false; - // One PHI comes from the outer loop (PNA), the other one from the inner loop - // (PNB). PNA effectively corresponds to IndPhi. - if (InnerLoop->contains(PNA)) - std::swap(PNA, PNB); - if (PNA != &Header->front() || PNB != &MatchBB->front()) + // One PHI comes from the outer loop (PSearch), the other one from the inner + // loop (PNeedle). PSearch effectively corresponds to IndPhi. + if (InnerLoop->contains(PSearch)) + std::swap(PSearch, PNeedle); + if (PSearch != &Header->front() || PNeedle != &MatchBB->front()) return false; // The incoming values of both PHI nodes should be a gep of 1. - Value *StartA = PNA->getIncomingValue(0); - Value *IndexA = PNA->getIncomingValue(1); - if (CurLoop->contains(PNA->getIncomingBlock(0))) - std::swap(StartA, IndexA); + Value *SearchStart = PSearch->getIncomingValue(0); + Value *SearchIndex = PSearch->getIncomingValue(1); + if (CurLoop->contains(PSearch->getIncomingBlock(0))) + std::swap(SearchStart, SearchIndex); - Value *StartB = PNB->getIncomingValue(0); - Value *IndexB = PNB->getIncomingValue(1); - if (InnerLoop->contains(PNB->getIncomingBlock(0))) - std::swap(StartB, IndexB); + Value *NeedleStart = PNeedle->getIncomingValue(0); + Value *NeedleIndex = PNeedle->getIncomingValue(1); + if (InnerLoop->contains(PNeedle->getIncomingBlock(0))) + std::swap(NeedleStart, NeedleIndex); // Match the GEPs.
- if (!match(IndexA, m_GEP(m_Specific(PNA), m_One())) || - !match(IndexB, m_GEP(m_Specific(PNB), m_One()))) + if (!match(SearchIndex, m_GEP(m_Specific(PSearch), m_One())) || + !match(NeedleIndex, m_GEP(m_Specific(PNeedle), m_One()))) return false; // Check the GEPs result type matches `CharTy'. - GetElementPtrInst *GEPA = cast<GetElementPtrInst>(IndexA); - GetElementPtrInst *GEPB = cast<GetElementPtrInst>(IndexB); - if (GEPA->getResultElementType() != CharTy || - GEPB->getResultElementType() != CharTy) + GetElementPtrInst *GEPSearch = cast<GetElementPtrInst>(SearchIndex); + GetElementPtrInst *GEPNeedle = cast<GetElementPtrInst>(NeedleIndex); + if (GEPSearch->getResultElementType() != CharTy || + GEPNeedle->getResultElementType() != CharTy) return false; // InnerBB should increment the address of the needle pointer. @@ -1131,11 +1136,12 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() { // %18 = icmp eq ptr %17, %10 // br i1 %18, label %OuterBB, label %MatchBB BasicBlock *OuterBB; - Value *EndB; + Value *NeedleEnd; if (!match(InnerBB->getTerminator(), - m_Br(m_ICmp(MatchPred, m_Specific(GEPB), m_Value(EndB)), + m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(GEPNeedle), + m_Value(NeedleEnd)), m_BasicBlock(OuterBB), m_Specific(MatchBB))) || - MatchPred != ICmpInst::Predicate::ICMP_EQ || !CurLoop->contains(OuterBB)) + !CurLoop->contains(OuterBB)) return false; // OuterBB should increment the address of the search element pointer.
@@ -1145,17 +1151,17 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() { // %25 = icmp eq ptr %24, %6 // br i1 %25, label %ExitFail, label %Header BasicBlock *ExitFail; - Value *EndA; + Value *SearchEnd; if (!match(OuterBB->getTerminator(), - m_Br(m_ICmp(MatchPred, m_Specific(GEPA), m_Value(EndA)), - m_BasicBlock(ExitFail), m_Specific(Header))) || - MatchPred != ICmpInst::Predicate::ICMP_EQ) + m_Br(m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(GEPSearch), + m_Value(SearchEnd)), + m_BasicBlock(ExitFail), m_Specific(Header)))) return false; LLVM_DEBUG(dbgs() << "Found idiom in loop: \n" << *CurLoop << "\n\n"); - transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, StartA, EndA, - StartB, EndB); + transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, SearchStart, + SearchEnd, NeedleStart, NeedleEnd); return true; } @@ -1187,6 +1193,8 @@ Value *LoopIdiomVectorize::expandFindFirstByte( // (I) Inner loop where we iterate over the elements of the needle array. // // Overall, the blocks do the following: + // (0) Check if the arrays can't cross page boundaries. If so go to (1), + // otherwise fall back to the original scalar loop. // (1) Load the search array. Go to (2). // (2) (a) Load the needle array. // (b) Splat the first element to the inactive lanes. @@ -1196,8 +1204,9 @@ Value *LoopIdiomVectorize::expandFindFirstByte( // (2), otherwise go to (5). // (5) Check if we've reached the end of the search array. If not loop back to // (1), otherwise exit. - // Block (3) is not part of any loop. Blocks (1,5) and (2,4) belong to the - // outer and inner loops, respectively. + // Blocks (0,3) are not part of any loop. Blocks (1,5) and (2,4) belong to + // the outer and inner loops, respectively. 
+ BasicBlock *BB0 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); BasicBlock *BB1 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); BasicBlock *BB2 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); BasicBlock *BB3 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); @@ -1209,6 +1218,7 @@ Value *LoopIdiomVectorize::expandFindFirstByte( auto InnerLoop = LI->AllocateLoop(); if (auto ParentLoop = CurLoop->getParentLoop()) { + ParentLoop->addBasicBlockToLoop(BB0, *LI); ParentLoop->addChildLoop(OuterLoop); ParentLoop->addBasicBlockToLoop(BB3, *LI); } else { @@ -1224,24 +1234,46 @@ Value *LoopIdiomVectorize::expandFindFirstByte( InnerLoop->addBasicBlockToLoop(BB2, *LI); InnerLoop->addBasicBlockToLoop(BB4, *LI); - // Set a reference to the old scalar loop and create a predicate of VF - // elements. - Builder.SetInsertPoint(Preheader->getTerminator()); - Value *Pred16 = + // Update the terminator added by SplitBlock to branch to the first block. + Preheader->getTerminator()->setSuccessor(0, BB0); + DTU.applyUpdates({{DominatorTree::Delete, Preheader, SPH}, + {DominatorTree::Insert, Preheader, BB0}}); + + // (0) Check if we could be crossing a page boundary; if so, fallback to the + // old scalar loops. Also create a predicate of VF elements to be used in the + // vector loops. 
+ Builder.SetInsertPoint(BB0); + Value *ISearchStart = Builder.CreatePtrToInt(SearchStart, I64Ty); + Value *ISearchEnd = Builder.CreatePtrToInt(SearchEnd, I64Ty); + Value *INeedleStart = Builder.CreatePtrToInt(NeedleStart, I64Ty); + Value *INeedleEnd = Builder.CreatePtrToInt(NeedleEnd, I64Ty); + Value *PredVF = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, {ConstantInt::get(I64Ty, 0), ConstVF}); - Builder.CreateCondBr(Builder.getFalse(), SPH, BB1); - Preheader->getTerminator()->eraseFromParent(); - DTU.applyUpdates({{DominatorTree::Insert, Preheader, BB1}}); + + const uint64_t MinPageSize = TTI->getMinPageSize().value(); + const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize); + Value *SearchStartPage = Builder.CreateLShr(ISearchStart, AddrShiftAmt); + Value *SearchEndPage = Builder.CreateLShr(ISearchEnd, AddrShiftAmt); + Value *NeedleStartPage = Builder.CreateLShr(INeedleStart, AddrShiftAmt); + Value *NeedleEndPage = Builder.CreateLShr(INeedleEnd, AddrShiftAmt); + Value *SearchPageCmp = Builder.CreateICmpNE(SearchStartPage, SearchEndPage); + Value *NeedlePageCmp = Builder.CreateICmpNE(NeedleStartPage, NeedleEndPage); + + Value *CombinedPageCmp = Builder.CreateOr(SearchPageCmp, NeedlePageCmp); + BranchInst *CombinedPageBr = Builder.CreateCondBr(CombinedPageCmp, SPH, BB1); + CombinedPageBr->setMetadata(LLVMContext::MD_prof, + MDBuilder(Ctx).createBranchWeights(10, 90)); + DTU.applyUpdates( + {{DominatorTree::Insert, BB0, SPH}, {DominatorTree::Insert, BB0, BB1}}); // (1) Load the search array and branch to the inner loop. 
Builder.SetInsertPoint(BB1); PHINode *Search = Builder.CreatePHI(PtrTy, 2, "psearch"); - Value *PredSearch = - Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, - {Builder.CreatePointerCast(Search, I64Ty), - Builder.CreatePointerCast(SearchEnd, I64Ty)}); - PredSearch = Builder.CreateAnd(Pred16, PredSearch); + Value *PredSearch = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, + {Builder.CreatePtrToInt(Search, I64Ty), ISearchEnd}); + PredSearch = Builder.CreateAnd(PredVF, PredSearch); Value *LoadSearch = Builder.CreateMaskedLoad(CharVTy, Search, Align(1), PredSearch, Passthru); Builder.CreateBr(BB2); @@ -1252,11 +1284,10 @@ Value *LoopIdiomVectorize::expandFindFirstByte( PHINode *Needle = Builder.CreatePHI(PtrTy, 2, "pneedle"); // (2.a) Load the needle array. - Value *PredNeedle = - Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, - {Builder.CreatePointerCast(Needle, I64Ty), - Builder.CreatePointerCast(NeedleEnd, I64Ty)}); - PredNeedle = Builder.CreateAnd(Pred16, PredNeedle); + Value *PredNeedle = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, + {Builder.CreatePtrToInt(Needle, I64Ty), INeedleEnd}); + PredNeedle = Builder.CreateAnd(PredVF, PredNeedle); Value *LoadNeedle = Builder.CreateMaskedLoad(CharVTy, Needle, Align(1), PredNeedle, Passthru); @@ -1279,10 +1310,12 @@ Value *LoopIdiomVectorize::expandFindFirstByte( // (3) We found a match. Compute the index of its location and exit. 
Builder.SetInsertPoint(BB3); + PHINode *MatchLCSSA = Builder.CreatePHI(PtrTy, 1); + PHINode *MatchPredLCSSA = Builder.CreatePHI(MatchPred->getType(), 1); Value *MatchCnt = Builder.CreateIntrinsic( Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()}, - {MatchPred, /*ZeroIsPoison=*/Builder.getInt1(true)}); - Value *MatchVal = Builder.CreateGEP(CharTy, Search, MatchCnt); + {MatchPredLCSSA, /*ZeroIsPoison=*/Builder.getInt1(true)}); + Value *MatchVal = Builder.CreateGEP(CharTy, MatchLCSSA, MatchCnt); Builder.CreateBr(ExitSucc); DTU.applyUpdates({{DominatorTree::Insert, BB3, ExitSucc}}); @@ -1301,11 +1334,14 @@ Value *LoopIdiomVectorize::expandFindFirstByte( DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1}, {DominatorTree::Insert, BB5, ExitFail}}); - // Set up the PHI's. - Search->addIncoming(SearchStart, Preheader); + // Set up the PHI nodes. + Search->addIncoming(SearchStart, BB0); Search->addIncoming(NextSearch, BB5); Needle->addIncoming(NeedleStart, BB1); Needle->addIncoming(NextNeedle, BB4); + // These are needed to retain LCSSA form. + MatchLCSSA->addIncoming(Search, BB2); + MatchPredLCSSA->addIncoming(MatchPred, BB2); if (VerifyLoops) { OuterLoop->verifyLoop(); @@ -1332,11 +1368,16 @@ void LoopIdiomVectorize::transformFindFirstByte( expandFindFirstByte(Builder, DTU, VF, CharTy, ExitSucc, ExitFail, SearchStart, SearchEnd, NeedleStart, NeedleEnd); + assert(PHBranch->isUnconditional() && + "Expected preheader to terminate with an unconditional branch."); + // Add new incoming values with the result of the transformation to PHINodes // of ExitSucc that use IndPhi. 
- for (auto *U : llvm::make_early_inc_range(IndPhi->users())) - if (auto *PN = dyn_cast(U); PN && PN->getParent() == ExitSucc) + for (auto *U : llvm::make_early_inc_range(IndPhi->users())) { + auto *PN = dyn_cast(U); + if (PN && PN->getParent() == ExitSucc) PN->addIncoming(MatchVal, cast(MatchVal)->getParent()); + } if (VerifyLoops && CurLoop->getParentLoop()) { CurLoop->getParentLoop()->verifyLoop(); diff --git a/llvm/test/CodeGen/AArch64/find-first-byte.ll b/llvm/test/CodeGen/AArch64/find-first-byte.ll index 202ee0982d95a..b7d24c0012aba 100644 --- a/llvm/test/CodeGen/AArch64/find-first-byte.ll +++ b/llvm/test/CodeGen/AArch64/find-first-byte.ll @@ -16,72 +16,85 @@ define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[BB48:.*]], label %[[DOTPREHEADER:.*]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[BB60:.*]], [[DOTPREHEADER:label %.*]] ; CHECK: [[_PREHEADER:.*:]] -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16) -; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]] -; CHECK: [[BB9]]: -; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ] -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP10]], i64 [[TMP11]]) -; CHECK-NEXT: [[TMP13:%.*]] = and [[TMP8]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, [[TMP13]], zeroinitializer) -; CHECK-NEXT: br label %[[BB15:.*]] -; CHECK: [[BB15]]: -; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ] -; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[PNEEDLE]] to 
i64 -; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP16]], i64 [[TMP17]]) -; CHECK-NEXT: [[TMP19:%.*]] = and [[TMP8]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, [[TMP19]], zeroinitializer) -; CHECK-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP21]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = select [[TMP19]], [[TMP20]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP23:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8( [[TMP22]], i64 0) -; CHECK-NEXT: [[TMP24:%.*]] = call @llvm.experimental.vector.match.nxv16i8.v16i8( [[TMP14]], <16 x i8> [[TMP23]], [[TMP13]]) -; CHECK-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP24]]) -; CHECK-NEXT: br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]] +; CHECK-NEXT: br label %[[BB8:.*]] +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[TMP0]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP2]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP3]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16) +; CHECK-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP9]], 12 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP10]], 12 +; CHECK-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP11]], 12 +; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP12]], 12 +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]] +; CHECK-NEXT: br i1 [[TMP20]], label %[[SCALAR_PH:.*]], label %[[BB21:.*]], !prof [[PROF0:![0-9]+]] +; CHECK: [[BB21]]: +; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[BB8]] ], 
[ [[TMP45:%.*]], %[[TMP44:.*]] ] +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP22]], i64 [[TMP10]]) +; CHECK-NEXT: [[TMP24:%.*]] = and [[TMP13]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, [[TMP24]], zeroinitializer) +; CHECK-NEXT: br label %[[BB26:.*]] ; CHECK: [[BB26]]: -; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP24]], i1 true) -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[PSEARCH]], i64 [[TMP27]] +; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB21]] ], [ [[TMP42:%.*]], %[[TMP41:.*]] ] +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 +; CHECK-NEXT: [[TMP28:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP27]], i64 [[TMP12]]) +; CHECK-NEXT: [[TMP29:%.*]] = and [[TMP13]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, [[TMP29]], zeroinitializer) +; CHECK-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP31]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = select [[TMP29]], [[TMP30]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP33:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8( [[TMP32]], i64 0) +; CHECK-NEXT: [[TMP34:%.*]] = call @llvm.experimental.vector.match.nxv16i8.v16i8( [[TMP25]], <16 x i8> [[TMP33]], [[TMP24]]) +; CHECK-NEXT: [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP34]]) +; CHECK-NEXT: br i1 [[TMP35]], label %[[BB36:.*]], label %[[TMP41]] +; CHECK: [[BB36]]: +; CHECK-NEXT: [[TMP37:%.*]] = phi ptr [ [[PSEARCH]], %[[BB26]] ] +; CHECK-NEXT: [[TMP38:%.*]] = phi [ [[TMP34]], %[[BB26]] ] +; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP38]], i1 true) +; 
CHECK-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[TMP37]], i64 [[TMP39]] ; CHECK-NEXT: br label %[[DOTLOOPEXIT:.*]] -; CHECK: [[TMP29]]: -; CHECK-NEXT: [[TMP30]] = getelementptr i8, ptr [[PNEEDLE]], i64 16 -; CHECK-NEXT: [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]] -; CHECK-NEXT: br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]] -; CHECK: [[TMP32]]: -; CHECK-NEXT: [[TMP33]] = getelementptr i8, ptr [[PSEARCH]], i64 16 -; CHECK-NEXT: [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP34]], label %[[BB9]], label %[[DOTLOOPEXIT1:.*]] +; CHECK: [[TMP41]]: +; CHECK-NEXT: [[TMP42]] = getelementptr i8, ptr [[PNEEDLE]], i64 16 +; CHECK-NEXT: [[TMP43:%.*]] = icmp ult ptr [[TMP42]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP43]], label %[[BB26]], label %[[TMP44]] +; CHECK: [[TMP44]]: +; CHECK-NEXT: [[TMP45]] = getelementptr i8, ptr [[PSEARCH]], i64 16 +; CHECK-NEXT: [[TMP46:%.*]] = icmp ult ptr [[TMP45]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP46]], label %[[BB21]], label %[[DOTLOOPEXIT1:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: br label %[[BB35:.*]] -; CHECK: [[BB35]]: -; CHECK-NEXT: [[TMP36:%.*]] = phi ptr [ [[TMP46:%.*]], %[[TMP45:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[TMP37:%.*]] = load i8, ptr [[TMP36]], align 1 -; CHECK-NEXT: br label %[[BB41:.*]] -; CHECK: [[BB38:.*]]: -; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[TMP42:%.*]], i64 1 -; CHECK-NEXT: [[TMP40:%.*]] = icmp eq ptr [[TMP39]], [[TMP3]] -; CHECK-NEXT: br i1 [[TMP40]], label %[[TMP45]], label %[[BB41]] -; CHECK: [[BB41]]: -; CHECK-NEXT: [[TMP42]] = phi ptr [ [[TMP2]], %[[BB35]] ], [ [[TMP39]], %[[BB38]] ] -; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr [[TMP42]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i8 [[TMP37]], [[TMP43]] -; CHECK-NEXT: br i1 [[TMP44]], label %[[DOTLOOPEXIT]], label %[[BB38]] -; CHECK: [[TMP45]]: -; CHECK-NEXT: [[TMP46]] = getelementptr inbounds i8, ptr [[TMP36]], i64 1 -; CHECK-NEXT: [[TMP47:%.*]] = icmp eq ptr [[TMP46]], [[TMP1]] -; 
CHECK-NEXT: br i1 [[TMP47]], label %[[DOTLOOPEXIT1]], label %[[BB35]] +; CHECK-NEXT: br label %[[BB47:.*]] +; CHECK: [[BB47]]: +; CHECK-NEXT: [[TMP48:%.*]] = phi ptr [ [[TMP58:%.*]], %[[TMP57:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 +; CHECK-NEXT: br label %[[BB53:.*]] +; CHECK: [[BB50:.*]]: +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP54:%.*]], i64 1 +; CHECK-NEXT: [[TMP52:%.*]] = icmp eq ptr [[TMP51]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP52]], label %[[TMP57]], label %[[BB53]] +; CHECK: [[BB53]]: +; CHECK-NEXT: [[TMP54]] = phi ptr [ [[TMP2]], %[[BB47]] ], [ [[TMP51]], %[[BB50]] ] +; CHECK-NEXT: [[TMP55:%.*]] = load i8, ptr [[TMP54]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i8 [[TMP49]], [[TMP55]] +; CHECK-NEXT: br i1 [[TMP56]], label %[[DOTLOOPEXIT]], label %[[BB50]] +; CHECK: [[TMP57]]: +; CHECK-NEXT: [[TMP58]] = getelementptr inbounds i8, ptr [[TMP48]], i64 1 +; CHECK-NEXT: [[TMP59:%.*]] = icmp eq ptr [[TMP58]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP59]], label %[[DOTLOOPEXIT1]], label %[[BB47]] ; CHECK: [[_LOOPEXIT:.*:]] -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP36]], %[[BB41]] ], [ [[TMP28]], %[[BB26]] ] -; CHECK-NEXT: br label %[[BB48]] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP48]], %[[BB53]] ], [ [[TMP40]], %[[BB36]] ] +; CHECK-NEXT: br label %[[BB60]] ; CHECK: [[_LOOPEXIT1:.*:]] -; CHECK-NEXT: br label %[[BB48]] -; CHECK: [[BB48]]: -; CHECK-NEXT: [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] -; CHECK-NEXT: ret ptr [[TMP49]] +; CHECK-NEXT: br label %[[BB60]] +; CHECK: [[BB60]]: +; CHECK-NEXT: [[TMP61:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] +; CHECK-NEXT: ret ptr [[TMP61]] ; ; DISABLE-LABEL: define ptr @find_first_of_i8( ; DISABLE-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) 
#[[ATTR0:[0-9]+]] { @@ -157,72 +170,85 @@ define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[BB48:.*]], label %[[DOTPREHEADER:.*]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[BB60:.*]], [[DOTPREHEADER:label %.*]] ; CHECK: [[_PREHEADER:.*:]] -; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8) -; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[BB9:.*]] -; CHECK: [[BB9]]: -; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[DOTPREHEADER]] ], [ [[TMP33:%.*]], %[[TMP32:.*]] ] -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP1]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP10]], i64 [[TMP11]]) -; CHECK-NEXT: [[TMP13:%.*]] = and [[TMP8]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[PSEARCH]], i32 1, [[TMP13]], zeroinitializer) -; CHECK-NEXT: br label %[[BB15:.*]] -; CHECK: [[BB15]]: -; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB9]] ], [ [[TMP30:%.*]], %[[TMP29:.*]] ] -; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 -; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP3]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP16]], i64 [[TMP17]]) -; CHECK-NEXT: [[TMP19:%.*]] = and [[TMP8]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[PNEEDLE]], i32 1, [[TMP19]], zeroinitializer) -; CHECK-NEXT: [[TMP21:%.*]] = extractelement [[TMP20]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i16 [[TMP21]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP22:%.*]] = select [[TMP19]], 
[[TMP20]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP23:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16( [[TMP22]], i64 0) -; CHECK-NEXT: [[TMP24:%.*]] = call @llvm.experimental.vector.match.nxv8i16.v8i16( [[TMP14]], <8 x i16> [[TMP23]], [[TMP13]]) -; CHECK-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1( [[TMP24]]) -; CHECK-NEXT: br i1 [[TMP25]], label %[[BB26:.*]], label %[[TMP29]] +; CHECK-NEXT: br label %[[BB8:.*]] +; CHECK: [[BB8]]: +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[TMP0]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP1]] to i64 +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP2]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP3]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8) +; CHECK-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP9]], 12 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP10]], 12 +; CHECK-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP11]], 12 +; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP12]], 12 +; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP16]], [[TMP17]] +; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]] +; CHECK-NEXT: br i1 [[TMP20]], label %[[SCALAR_PH:.*]], label %[[BB21:.*]], !prof [[PROF0]] +; CHECK: [[BB21]]: +; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[BB8]] ], [ [[TMP45:%.*]], %[[TMP44:.*]] ] +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 +; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP22]], i64 [[TMP10]]) +; CHECK-NEXT: [[TMP24:%.*]] = and [[TMP13]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[PSEARCH]], i32 1, [[TMP24]], zeroinitializer) +; CHECK-NEXT: br label %[[BB26:.*]] ; CHECK: [[BB26]]: -; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( [[TMP24]], i1 true) -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i16, ptr [[PSEARCH]], i64 [[TMP27]] +; CHECK-NEXT: 
[[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB21]] ], [ [[TMP42:%.*]], %[[TMP41:.*]] ] +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 +; CHECK-NEXT: [[TMP28:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP27]], i64 [[TMP12]]) +; CHECK-NEXT: [[TMP29:%.*]] = and [[TMP13]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[PNEEDLE]], i32 1, [[TMP29]], zeroinitializer) +; CHECK-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i16 [[TMP31]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[TMP32:%.*]] = select [[TMP29]], [[TMP30]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP33:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16( [[TMP32]], i64 0) +; CHECK-NEXT: [[TMP34:%.*]] = call @llvm.experimental.vector.match.nxv8i16.v8i16( [[TMP25]], <8 x i16> [[TMP33]], [[TMP24]]) +; CHECK-NEXT: [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1( [[TMP34]]) +; CHECK-NEXT: br i1 [[TMP35]], label %[[BB36:.*]], label %[[TMP41]] +; CHECK: [[BB36]]: +; CHECK-NEXT: [[TMP37:%.*]] = phi ptr [ [[PSEARCH]], %[[BB26]] ] +; CHECK-NEXT: [[TMP38:%.*]] = phi [ [[TMP34]], %[[BB26]] ] +; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( [[TMP38]], i1 true) +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i16, ptr [[TMP37]], i64 [[TMP39]] ; CHECK-NEXT: br label %[[DOTLOOPEXIT:.*]] -; CHECK: [[TMP29]]: -; CHECK-NEXT: [[TMP30]] = getelementptr i16, ptr [[PNEEDLE]], i64 8 -; CHECK-NEXT: [[TMP31:%.*]] = icmp ult ptr [[TMP30]], [[TMP3]] -; CHECK-NEXT: br i1 [[TMP31]], label %[[BB15]], label %[[TMP32]] -; CHECK: [[TMP32]]: -; CHECK-NEXT: [[TMP33]] = getelementptr i16, ptr [[PSEARCH]], i64 8 -; CHECK-NEXT: [[TMP34:%.*]] = icmp ult ptr [[TMP33]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP34]], label %[[BB9]], label %[[DOTLOOPEXIT1:.*]] +; CHECK: [[TMP41]]: +; CHECK-NEXT: [[TMP42]] = getelementptr i16, ptr 
[[PNEEDLE]], i64 8 +; CHECK-NEXT: [[TMP43:%.*]] = icmp ult ptr [[TMP42]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP43]], label %[[BB26]], label %[[TMP44]] +; CHECK: [[TMP44]]: +; CHECK-NEXT: [[TMP45]] = getelementptr i16, ptr [[PSEARCH]], i64 8 +; CHECK-NEXT: [[TMP46:%.*]] = icmp ult ptr [[TMP45]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP46]], label %[[BB21]], label %[[DOTLOOPEXIT1:.*]] ; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: br label %[[BB35:.*]] -; CHECK: [[BB35]]: -; CHECK-NEXT: [[TMP36:%.*]] = phi ptr [ [[TMP46:%.*]], %[[TMP45:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[TMP37:%.*]] = load i16, ptr [[TMP36]], align 1 -; CHECK-NEXT: br label %[[BB41:.*]] -; CHECK: [[BB38:.*]]: -; CHECK-NEXT: [[TMP39:%.*]] = getelementptr inbounds i16, ptr [[TMP42:%.*]], i64 1 -; CHECK-NEXT: [[TMP40:%.*]] = icmp eq ptr [[TMP39]], [[TMP3]] -; CHECK-NEXT: br i1 [[TMP40]], label %[[TMP45]], label %[[BB41]] -; CHECK: [[BB41]]: -; CHECK-NEXT: [[TMP42]] = phi ptr [ [[TMP2]], %[[BB35]] ], [ [[TMP39]], %[[BB38]] ] -; CHECK-NEXT: [[TMP43:%.*]] = load i16, ptr [[TMP42]], align 1 -; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i16 [[TMP37]], [[TMP43]] -; CHECK-NEXT: br i1 [[TMP44]], label %[[DOTLOOPEXIT]], label %[[BB38]] -; CHECK: [[TMP45]]: -; CHECK-NEXT: [[TMP46]] = getelementptr inbounds i16, ptr [[TMP36]], i64 1 -; CHECK-NEXT: [[TMP47:%.*]] = icmp eq ptr [[TMP46]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP47]], label %[[DOTLOOPEXIT1]], label %[[BB35]] +; CHECK-NEXT: br label %[[BB47:.*]] +; CHECK: [[BB47]]: +; CHECK-NEXT: [[TMP48:%.*]] = phi ptr [ [[TMP58:%.*]], %[[TMP57:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ] +; CHECK-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP48]], align 1 +; CHECK-NEXT: br label %[[BB53:.*]] +; CHECK: [[BB50:.*]]: +; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i16, ptr [[TMP54:%.*]], i64 1 +; CHECK-NEXT: [[TMP52:%.*]] = icmp eq ptr [[TMP51]], [[TMP3]] +; CHECK-NEXT: br i1 [[TMP52]], label %[[TMP57]], label %[[BB53]] +; CHECK: [[BB53]]: +; CHECK-NEXT: [[TMP54]] = phi ptr [ [[TMP2]], 
%[[BB47]] ], [ [[TMP51]], %[[BB50]] ] +; CHECK-NEXT: [[TMP55:%.*]] = load i16, ptr [[TMP54]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i16 [[TMP49]], [[TMP55]] +; CHECK-NEXT: br i1 [[TMP56]], label %[[DOTLOOPEXIT]], label %[[BB50]] +; CHECK: [[TMP57]]: +; CHECK-NEXT: [[TMP58]] = getelementptr inbounds i16, ptr [[TMP48]], i64 1 +; CHECK-NEXT: [[TMP59:%.*]] = icmp eq ptr [[TMP58]], [[TMP1]] +; CHECK-NEXT: br i1 [[TMP59]], label %[[DOTLOOPEXIT1]], label %[[BB47]] ; CHECK: [[_LOOPEXIT:.*:]] -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP36]], %[[BB41]] ], [ [[TMP28]], %[[BB26]] ] -; CHECK-NEXT: br label %[[BB48]] +; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP48]], %[[BB53]] ], [ [[TMP40]], %[[BB36]] ] +; CHECK-NEXT: br label %[[BB60]] ; CHECK: [[_LOOPEXIT1:.*:]] -; CHECK-NEXT: br label %[[BB48]] -; CHECK: [[BB48]]: -; CHECK-NEXT: [[TMP49:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] -; CHECK-NEXT: ret ptr [[TMP49]] +; CHECK-NEXT: br label %[[BB60]] +; CHECK: [[BB60]]: +; CHECK-NEXT: [[TMP61:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] +; CHECK-NEXT: ret ptr [[TMP61]] ; ; DISABLE-LABEL: define ptr @find_first_of_i16( ; DISABLE-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { @@ -414,3 +440,6 @@ define ptr @find_first_of_i8_outside_use(ptr %0, ptr %1, ptr %2, ptr %3) #0 { } attributes #0 = { "target-features"="+sve2" } +;. +; CHECK: [[PROF0]] = !{!"branch_weights", i32 10, i32 90} +;. 
From ab3b6464990eb54216772c8baa963e9c514522c9 Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: Fri, 31 Jan 2025 09:08:29 +0000 Subject: [PATCH 4/7] Move tests to llvm/test/Transforms/LoopIdiom/AArch64 --- .../{CodeGen => Transforms/LoopIdiom}/AArch64/find-first-byte.ll | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename llvm/test/{CodeGen => Transforms/LoopIdiom}/AArch64/find-first-byte.ll (100%) diff --git a/llvm/test/CodeGen/AArch64/find-first-byte.ll b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll similarity index 100% rename from llvm/test/CodeGen/AArch64/find-first-byte.ll rename to llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll From 38860a86c6f444a446f1f00cd110784123f09f60 Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: Tue, 4 Feb 2025 13:12:22 +0000 Subject: [PATCH 5/7] Add names to blocks and variables --- .../Vectorize/LoopIdiomVectorize.cpp | 106 ++- .../LoopIdiom/AArch64/find-first-byte.ll | 777 +++++++++--------- 2 files changed, 458 insertions(+), 425 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp index 7c42cdf056a93..44fe5ba3a0bfd 100644 --- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp @@ -1184,7 +1184,7 @@ Value *LoopIdiomVectorize::expandFindFirstByte( // Split block in the original loop preheader. // SPH is the new preheader to the old scalar loop. BasicBlock *SPH = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI, - nullptr, "scalar_ph"); + nullptr, "scalar_preheader"); // Create the blocks that we're going to use. // @@ -1206,12 +1206,17 @@ Value *LoopIdiomVectorize::expandFindFirstByte( // (1), otherwise exit. // Blocks (0,3) are not part of any loop. Blocks (1,5) and (2,4) belong to // the outer and inner loops, respectively. 
- BasicBlock *BB0 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); - BasicBlock *BB1 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); - BasicBlock *BB2 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); - BasicBlock *BB3 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); - BasicBlock *BB4 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); - BasicBlock *BB5 = BasicBlock::Create(Ctx, "", SPH->getParent(), SPH); + BasicBlock *BB0 = BasicBlock::Create(Ctx, "mem_check", SPH->getParent(), SPH); + BasicBlock *BB1 = + BasicBlock::Create(Ctx, "find_first_vec_header", SPH->getParent(), SPH); + BasicBlock *BB2 = + BasicBlock::Create(Ctx, "match_check_vec", SPH->getParent(), SPH); + BasicBlock *BB3 = + BasicBlock::Create(Ctx, "calculate_match", SPH->getParent(), SPH); + BasicBlock *BB4 = + BasicBlock::Create(Ctx, "needle_check_vec", SPH->getParent(), SPH); + BasicBlock *BB5 = + BasicBlock::Create(Ctx, "search_check_vec", SPH->getParent(), SPH); // Update LoopInfo with the new loops. auto OuterLoop = LI->AllocateLoop(); @@ -1243,24 +1248,35 @@ Value *LoopIdiomVectorize::expandFindFirstByte( // old scalar loops. Also create a predicate of VF elements to be used in the // vector loops. 
Builder.SetInsertPoint(BB0); - Value *ISearchStart = Builder.CreatePtrToInt(SearchStart, I64Ty); - Value *ISearchEnd = Builder.CreatePtrToInt(SearchEnd, I64Ty); - Value *INeedleStart = Builder.CreatePtrToInt(NeedleStart, I64Ty); - Value *INeedleEnd = Builder.CreatePtrToInt(NeedleEnd, I64Ty); + Value *ISearchStart = + Builder.CreatePtrToInt(SearchStart, I64Ty, "search_start_int"); + Value *ISearchEnd = + Builder.CreatePtrToInt(SearchEnd, I64Ty, "search_end_int"); + Value *INeedleStart = + Builder.CreatePtrToInt(NeedleStart, I64Ty, "needle_start_int"); + Value *INeedleEnd = + Builder.CreatePtrToInt(NeedleEnd, I64Ty, "needle_end_int"); Value *PredVF = Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, {ConstantInt::get(I64Ty, 0), ConstVF}); const uint64_t MinPageSize = TTI->getMinPageSize().value(); const uint64_t AddrShiftAmt = llvm::Log2_64(MinPageSize); - Value *SearchStartPage = Builder.CreateLShr(ISearchStart, AddrShiftAmt); - Value *SearchEndPage = Builder.CreateLShr(ISearchEnd, AddrShiftAmt); - Value *NeedleStartPage = Builder.CreateLShr(INeedleStart, AddrShiftAmt); - Value *NeedleEndPage = Builder.CreateLShr(INeedleEnd, AddrShiftAmt); - Value *SearchPageCmp = Builder.CreateICmpNE(SearchStartPage, SearchEndPage); - Value *NeedlePageCmp = Builder.CreateICmpNE(NeedleStartPage, NeedleEndPage); - - Value *CombinedPageCmp = Builder.CreateOr(SearchPageCmp, NeedlePageCmp); + Value *SearchStartPage = + Builder.CreateLShr(ISearchStart, AddrShiftAmt, "search_start_page"); + Value *SearchEndPage = + Builder.CreateLShr(ISearchEnd, AddrShiftAmt, "search_end_page"); + Value *NeedleStartPage = + Builder.CreateLShr(INeedleStart, AddrShiftAmt, "needle_start_page"); + Value *NeedleEndPage = + Builder.CreateLShr(INeedleEnd, AddrShiftAmt, "needle_end_page"); + Value *SearchPageCmp = + Builder.CreateICmpNE(SearchStartPage, SearchEndPage, "search_page_cmp"); + Value *NeedlePageCmp = + Builder.CreateICmpNE(NeedleStartPage, NeedleEndPage, "needle_page_cmp"); + 
+ Value *CombinedPageCmp = + Builder.CreateOr(SearchPageCmp, NeedlePageCmp, "combined_page_cmp"); BranchInst *CombinedPageBr = Builder.CreateCondBr(CombinedPageCmp, SPH, BB1); CombinedPageBr->setMetadata(LLVMContext::MD_prof, MDBuilder(Ctx).createBranchWeights(10, 90)); @@ -1272,10 +1288,11 @@ Value *LoopIdiomVectorize::expandFindFirstByte( PHINode *Search = Builder.CreatePHI(PtrTy, 2, "psearch"); Value *PredSearch = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, - {Builder.CreatePtrToInt(Search, I64Ty), ISearchEnd}); - PredSearch = Builder.CreateAnd(PredVF, PredSearch); - Value *LoadSearch = - Builder.CreateMaskedLoad(CharVTy, Search, Align(1), PredSearch, Passthru); + {Builder.CreatePtrToInt(Search, I64Ty), ISearchEnd}, nullptr, + "search_pred"); + PredSearch = Builder.CreateAnd(PredVF, PredSearch, "search_masked"); + Value *LoadSearch = Builder.CreateMaskedLoad( + CharVTy, Search, Align(1), PredSearch, Passthru, "search_load_vec"); Builder.CreateBr(BB2); DTU.applyUpdates({{DominatorTree::Insert, BB1, BB2}}); @@ -1286,23 +1303,27 @@ Value *LoopIdiomVectorize::expandFindFirstByte( // (2.a) Load the needle array. Value *PredNeedle = Builder.CreateIntrinsic( Intrinsic::get_active_lane_mask, {PredVTy, I64Ty}, - {Builder.CreatePtrToInt(Needle, I64Ty), INeedleEnd}); - PredNeedle = Builder.CreateAnd(PredVF, PredNeedle); - Value *LoadNeedle = - Builder.CreateMaskedLoad(CharVTy, Needle, Align(1), PredNeedle, Passthru); + {Builder.CreatePtrToInt(Needle, I64Ty), INeedleEnd}, nullptr, + "needle_pred"); + PredNeedle = Builder.CreateAnd(PredVF, PredNeedle, "needle_masked"); + Value *LoadNeedle = Builder.CreateMaskedLoad( + CharVTy, Needle, Align(1), PredNeedle, Passthru, "needle_load_vec"); // (2.b) Splat the first element to the inactive lanes. 
- Value *Needle0 = Builder.CreateExtractElement(LoadNeedle, uint64_t(0)); - Value *Needle0Splat = - Builder.CreateVectorSplat(ElementCount::getScalable(VF), Needle0); - LoadNeedle = Builder.CreateSelect(PredNeedle, LoadNeedle, Needle0Splat); - LoadNeedle = Builder.CreateExtractVector( - FixedVectorType::get(CharTy, VF), LoadNeedle, ConstantInt::get(I64Ty, 0)); + Value *Needle0 = + Builder.CreateExtractElement(LoadNeedle, uint64_t(0), "needle0"); + Value *Needle0Splat = Builder.CreateVectorSplat(ElementCount::getScalable(VF), + Needle0, "needle0"); + LoadNeedle = Builder.CreateSelect(PredNeedle, LoadNeedle, Needle0Splat, + "needle_splat"); + LoadNeedle = + Builder.CreateExtractVector(FixedVectorType::get(CharTy, VF), LoadNeedle, + ConstantInt::get(I64Ty, 0), "needle_vec"); // (2.c) Test if there's a match. Value *MatchPred = Builder.CreateIntrinsic( Intrinsic::experimental_vector_match, {CharVTy, LoadNeedle->getType()}, - {LoadSearch, LoadNeedle, PredSearch}); + {LoadSearch, LoadNeedle, PredSearch}, nullptr, "match_pred"); Value *IfAnyMatch = Builder.CreateOrReduce(MatchPred); Builder.CreateCondBr(IfAnyMatch, BB3, BB4); DTU.applyUpdates( @@ -1310,25 +1331,30 @@ Value *LoopIdiomVectorize::expandFindFirstByte( // (3) We found a match. Compute the index of its location and exit. 
Builder.SetInsertPoint(BB3); - PHINode *MatchLCSSA = Builder.CreatePHI(PtrTy, 1); - PHINode *MatchPredLCSSA = Builder.CreatePHI(MatchPred->getType(), 1); + PHINode *MatchLCSSA = Builder.CreatePHI(PtrTy, 1, "match_start"); + PHINode *MatchPredLCSSA = + Builder.CreatePHI(MatchPred->getType(), 1, "match_vec"); Value *MatchCnt = Builder.CreateIntrinsic( Intrinsic::experimental_cttz_elts, {I64Ty, MatchPred->getType()}, - {MatchPredLCSSA, /*ZeroIsPoison=*/Builder.getInt1(true)}); - Value *MatchVal = Builder.CreateGEP(CharTy, MatchLCSSA, MatchCnt); + {MatchPredLCSSA, /*ZeroIsPoison=*/Builder.getInt1(true)}, nullptr, + "match_idx"); + Value *MatchVal = + Builder.CreateGEP(CharTy, MatchLCSSA, MatchCnt, "match_res"); Builder.CreateBr(ExitSucc); DTU.applyUpdates({{DominatorTree::Insert, BB3, ExitSucc}}); // (4) Check if we've reached the end of the needle array. Builder.SetInsertPoint(BB4); - Value *NextNeedle = Builder.CreateGEP(CharTy, Needle, ConstVF); + Value *NextNeedle = + Builder.CreateGEP(CharTy, Needle, ConstVF, "needle_next_vec"); Builder.CreateCondBr(Builder.CreateICmpULT(NextNeedle, NeedleEnd), BB2, BB5); DTU.applyUpdates( {{DominatorTree::Insert, BB4, BB2}, {DominatorTree::Insert, BB4, BB5}}); // (5) Check if we've reached the end of the search array. 
Builder.SetInsertPoint(BB5); - Value *NextSearch = Builder.CreateGEP(CharTy, Search, ConstVF); + Value *NextSearch = + Builder.CreateGEP(CharTy, Search, ConstVF, "search_next_vec"); Builder.CreateCondBr(Builder.CreateICmpULT(NextSearch, SearchEnd), BB1, ExitFail); DTU.applyUpdates({{DominatorTree::Insert, BB5, BB1}, diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll index b7d24c0012aba..92cde4d27f2c0 100644 --- a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll +++ b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll @@ -10,309 +10,314 @@ ; return first; ; return last; ; } -define ptr @find_first_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 { +define ptr @find_first_of_i8(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 { ; CHECK-LABEL: define ptr @find_first_of_i8( -; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[BB60:.*]], [[DOTPREHEADER:label %.*]] -; CHECK: [[_PREHEADER:.*:]] -; CHECK-NEXT: br label %[[BB8:.*]] -; CHECK: [[BB8]]: -; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[TMP0]] to i64 -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP2]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP3]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16) -; CHECK-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP9]], 12 -; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP10]], 12 -; CHECK-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP11]], 12 -; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP12]], 12 -; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i64 [[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i64 
[[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]] -; CHECK-NEXT: br i1 [[TMP20]], label %[[SCALAR_PH:.*]], label %[[BB21:.*]], !prof [[PROF0:![0-9]+]] -; CHECK: [[BB21]]: -; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[BB8]] ], [ [[TMP45:%.*]], %[[TMP44:.*]] ] -; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 -; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP22]], i64 [[TMP10]]) -; CHECK-NEXT: [[TMP24:%.*]] = and [[TMP13]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, [[TMP24]], zeroinitializer) -; CHECK-NEXT: br label %[[BB26:.*]] -; CHECK: [[BB26]]: -; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB21]] ], [ [[TMP42:%.*]], %[[TMP41:.*]] ] -; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 -; CHECK-NEXT: [[TMP28:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP27]], i64 [[TMP12]]) -; CHECK-NEXT: [[TMP29:%.*]] = and [[TMP13]], [[TMP28]] -; CHECK-NEXT: [[TMP30:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, [[TMP29]], zeroinitializer) -; CHECK-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP31]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP32:%.*]] = select [[TMP29]], [[TMP30]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP33:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8( [[TMP32]], i64 0) -; CHECK-NEXT: [[TMP34:%.*]] = call @llvm.experimental.vector.match.nxv16i8.v16i8( [[TMP25]], <16 x i8> [[TMP33]], [[TMP24]]) -; CHECK-NEXT: [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[TMP34]]) -; CHECK-NEXT: br i1 [[TMP35]], label %[[BB36:.*]], label %[[TMP41]] -; CHECK: [[BB36]]: -; CHECK-NEXT: [[TMP37:%.*]] = phi ptr [ [[PSEARCH]], %[[BB26]] ] -; CHECK-NEXT: [[TMP38:%.*]] = phi [ [[TMP34]], %[[BB26]] ] -; CHECK-NEXT: [[TMP39:%.*]] = 
call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[TMP38]], i1 true) -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[TMP37]], i64 [[TMP39]] -; CHECK-NEXT: br label %[[DOTLOOPEXIT:.*]] -; CHECK: [[TMP41]]: -; CHECK-NEXT: [[TMP42]] = getelementptr i8, ptr [[PNEEDLE]], i64 16 -; CHECK-NEXT: [[TMP43:%.*]] = icmp ult ptr [[TMP42]], [[TMP3]] -; CHECK-NEXT: br i1 [[TMP43]], label %[[BB26]], label %[[TMP44]] -; CHECK: [[TMP44]]: -; CHECK-NEXT: [[TMP45]] = getelementptr i8, ptr [[PSEARCH]], i64 16 -; CHECK-NEXT: [[TMP46:%.*]] = icmp ult ptr [[TMP45]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP46]], label %[[BB21]], label %[[DOTLOOPEXIT1:.*]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: br label %[[BB47:.*]] -; CHECK: [[BB47]]: -; CHECK-NEXT: [[TMP48:%.*]] = phi ptr [ [[TMP58:%.*]], %[[TMP57:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[TMP49:%.*]] = load i8, ptr [[TMP48]], align 1 -; CHECK-NEXT: br label %[[BB53:.*]] -; CHECK: [[BB50:.*]]: -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i8, ptr [[TMP54:%.*]], i64 1 -; CHECK-NEXT: [[TMP52:%.*]] = icmp eq ptr [[TMP51]], [[TMP3]] -; CHECK-NEXT: br i1 [[TMP52]], label %[[TMP57]], label %[[BB53]] -; CHECK: [[BB53]]: -; CHECK-NEXT: [[TMP54]] = phi ptr [ [[TMP2]], %[[BB47]] ], [ [[TMP51]], %[[BB50]] ] -; CHECK-NEXT: [[TMP55:%.*]] = load i8, ptr [[TMP54]], align 1 -; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i8 [[TMP49]], [[TMP55]] -; CHECK-NEXT: br i1 [[TMP56]], label %[[DOTLOOPEXIT]], label %[[BB50]] -; CHECK: [[TMP57]]: -; CHECK-NEXT: [[TMP58]] = getelementptr inbounds i8, ptr [[TMP48]], i64 1 -; CHECK-NEXT: [[TMP59:%.*]] = icmp eq ptr [[TMP58]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP59]], label %[[DOTLOOPEXIT1]], label %[[BB47]] -; CHECK: [[_LOOPEXIT:.*:]] -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP48]], %[[BB53]] ], [ [[TMP40]], %[[BB36]] ] -; CHECK-NEXT: br label %[[BB60]] -; CHECK: [[_LOOPEXIT1:.*:]] -; CHECK-NEXT: br label %[[BB60]] -; CHECK: [[BB60]]: -; CHECK-NEXT: [[TMP61:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] 
], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] -; CHECK-NEXT: ret ptr [[TMP61]] +; CHECK-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]] +; CHECK-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]] +; CHECK-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]] +; CHECK-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]] +; CHECK: [[HEADER_PREHEADER]]: +; CHECK-NEXT: br label %[[MEM_CHECK:.*]] +; CHECK: [[MEM_CHECK]]: +; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64 +; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64 +; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64 +; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16) +; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12 +; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12 +; CHECK-NEXT: [[NEEDLE_START_PAGE:%.*]] = lshr i64 [[NEEDLE_START_INT]], 12 +; CHECK-NEXT: [[NEEDLE_END_PAGE:%.*]] = lshr i64 [[NEEDLE_END_INT]], 12 +; CHECK-NEXT: [[SEARCH_PAGE_CMP:%.*]] = icmp ne i64 [[SEARCH_START_PAGE]], [[SEARCH_END_PAGE]] +; CHECK-NEXT: [[NEEDLE_PAGE_CMP:%.*]] = icmp ne i64 [[NEEDLE_START_PAGE]], [[NEEDLE_END_PAGE]] +; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]] +; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0:![0-9]+]] +; CHECK: [[FIND_FIRST_VEC_HEADER]]: +; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ [[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ] +; 
CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 +; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]]) +; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and [[TMP0]], [[SEARCH_PRED]] +; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, [[SEARCH_MASKED]], zeroinitializer) +; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]] +; CHECK: [[MATCH_CHECK_VEC]]: +; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 +; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]]) +; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and [[TMP0]], [[NEEDLE_PRED]] +; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, [[NEEDLE_MASKED]], zeroinitializer) +; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement [[NEEDLE_LOAD_VEC]], i64 0 +; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement poison, i8 [[NEEDLE0]], i64 0 +; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector [[NEEDLE0_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select [[NEEDLE_MASKED]], [[NEEDLE_LOAD_VEC]], [[NEEDLE0_SPLAT]] +; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8( [[NEEDLE_SPLAT]], i64 0) +; CHECK-NEXT: [[MATCH_PRED:%.*]] = call @llvm.experimental.vector.match.nxv16i8.v16i8( [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], [[SEARCH_MASKED]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1( [[MATCH_PRED]]) +; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]] +; CHECK: [[CALCULATE_MATCH]]: +; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ] +; CHECK-NEXT: [[MATCH_VEC:%.*]] = phi [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ] +; 
CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1( [[MATCH_VEC]], i1 true) +; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]] +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] +; CHECK: [[NEEDLE_CHECK_VEC]]: +; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]] +; CHECK: [[SEARCH_CHECK_VEC]]: +; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]] +; CHECK: [[SCALAR_PREHEADER]]: +; CHECK-NEXT: br label %[[HEADER:.*]] +; CHECK: [[HEADER]]: +; CHECK-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[SCALAR_PREHEADER]] ] +; CHECK-NEXT: [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1 +; CHECK-NEXT: br label %[[MATCH_CHECK:.*]] +; CHECK: [[NEEDLE_CHECK:.*]]: +; CHECK-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1 +; CHECK-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]] +; CHECK-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]] +; CHECK: [[MATCH_CHECK]]: +; CHECK-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ] +; CHECK-NEXT: [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1 +; CHECK-NEXT: [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]] +; CHECK-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT]], label %[[NEEDLE_CHECK]] +; CHECK: [[SEARCH_CHECK]]: +; CHECK-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1 +; CHECK-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr 
[[SEARCH_NEXT]], [[SEARCH_END]] +; CHECK-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1]], label %[[HEADER]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ], [ [[MATCH_RES]], %[[CALCULATE_MATCH]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT_LOOPEXIT1]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ] +; CHECK-NEXT: ret ptr [[RES]] ; ; DISABLE-LABEL: define ptr @find_first_of_i8( -; DISABLE-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0:[0-9]+]] { -; DISABLE-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] -; DISABLE-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] -; DISABLE-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; DISABLE-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]] -; DISABLE: [[_PREHEADER:.*:]] -; DISABLE-NEXT: br label %[[BB8:.*]] -; DISABLE: [[BB8]]: -; DISABLE-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ] -; DISABLE-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP9]], align 1 -; DISABLE-NEXT: br label %[[BB14:.*]] -; DISABLE: [[BB11:.*]]: -; DISABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP15:%.*]], i64 1 -; DISABLE-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]] -; DISABLE-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]] -; DISABLE: [[BB14]]: -; DISABLE-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ] -; DISABLE-NEXT: [[TMP16:%.*]] = load i8, ptr [[TMP15]], align 1 -; DISABLE-NEXT: [[TMP17:%.*]] = icmp eq i8 [[TMP10]], [[TMP16]] -; DISABLE-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]] -; DISABLE: [[TMP18]]: -; DISABLE-NEXT: [[TMP19]] = getelementptr inbounds i8, ptr [[TMP9]], i64 1 -; DISABLE-NEXT: [[TMP20:%.*]] = icmp eq ptr 
[[TMP19]], [[TMP1]] -; DISABLE-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]] -; DISABLE: [[_LOOPEXIT:.*:]] -; DISABLE-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ] -; DISABLE-NEXT: br label %[[BB21]] -; DISABLE: [[_LOOPEXIT1:.*:]] -; DISABLE-NEXT: br label %[[BB21]] -; DISABLE: [[BB21]]: -; DISABLE-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] -; DISABLE-NEXT: ret ptr [[TMP22]] +; DISABLE-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0:[0-9]+]] { +; DISABLE-NEXT: [[ENTRY:.*]]: +; DISABLE-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]] +; DISABLE-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]] +; DISABLE-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]] +; DISABLE-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]] +; DISABLE: [[HEADER_PREHEADER]]: +; DISABLE-NEXT: br label %[[HEADER:.*]] +; DISABLE: [[HEADER]]: +; DISABLE-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[HEADER_PREHEADER]] ] +; DISABLE-NEXT: [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1 +; DISABLE-NEXT: br label %[[MATCH_CHECK:.*]] +; DISABLE: [[NEEDLE_CHECK:.*]]: +; DISABLE-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1 +; DISABLE-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]] +; DISABLE-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]] +; DISABLE: [[MATCH_CHECK]]: +; DISABLE-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ] +; DISABLE-NEXT: [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1 +; DISABLE-NEXT: [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]] +; DISABLE-NEXT: br i1 [[MATCH_CMP]], 
label %[[EXIT_LOOPEXIT:.*]], label %[[NEEDLE_CHECK]] +; DISABLE: [[SEARCH_CHECK]]: +; DISABLE-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1 +; DISABLE-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]] +; DISABLE-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1:.*]], label %[[HEADER]] +; DISABLE: [[EXIT_LOOPEXIT]]: +; DISABLE-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ] +; DISABLE-NEXT: br label %[[EXIT]] +; DISABLE: [[EXIT_LOOPEXIT1]]: +; DISABLE-NEXT: br label %[[EXIT]] +; DISABLE: [[EXIT]]: +; DISABLE-NEXT: [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ] +; DISABLE-NEXT: ret ptr [[RES]] ; - %5 = icmp eq ptr %0, %1 - %6 = icmp eq ptr %2, %3 - %7 = or i1 %5, %6 - br i1 %7, label %21, label %8 - -8: - %9 = phi ptr [ %19, %18 ], [ %0, %4 ] - %10 = load i8, ptr %9, align 1 - br label %14 - -11: - %12 = getelementptr inbounds i8, ptr %15, i64 1 - %13 = icmp eq ptr %12, %3 - br i1 %13, label %18, label %14 - -14: - %15 = phi ptr [ %2, %8 ], [ %12, %11 ] - %16 = load i8, ptr %15, align 1 - %17 = icmp eq i8 %10, %16 - br i1 %17, label %21, label %11 - -18: - %19 = getelementptr inbounds i8, ptr %9, i64 1 - %20 = icmp eq ptr %19, %1 - br i1 %20, label %21, label %8 - -21: - %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ] - ret ptr %22 +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i8, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1 + %needle_cmp = icmp eq ptr %needle_next, %needle_end + br i1 %needle_cmp, label 
%search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i8, ptr %needle_ptr, align 1 + %match_cmp = icmp eq i8 %search_load, %needle_load + br i1 %match_cmp, label %exit, label %needle_check + +search_check: + %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1 + %search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit, label %header + +exit: + %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ] + ret ptr %res } -; Same as @find_first_of_i8 but with i16. +; Equivalent to @find_first_of_i8 but with i16. ; This is accepted and generates a similar loop. -define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 { -; +define ptr @find_first_of_i16(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 { ; CHECK-LABEL: define ptr @find_first_of_i16( -; CHECK-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[BB60:.*]], [[DOTPREHEADER:label %.*]] -; CHECK: [[_PREHEADER:.*:]] -; CHECK-NEXT: br label %[[BB8:.*]] -; CHECK: [[BB8]]: -; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr [[TMP0]] to i64 -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint ptr [[TMP1]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP2]] to i64 -; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr [[TMP3]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8) -; CHECK-NEXT: [[TMP14:%.*]] = lshr i64 [[TMP9]], 12 -; CHECK-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP10]], 12 -; CHECK-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP11]], 12 -; CHECK-NEXT: [[TMP17:%.*]] = lshr i64 [[TMP12]], 12 -; CHECK-NEXT: [[TMP18:%.*]] = icmp ne i64 
[[TMP14]], [[TMP15]] -; CHECK-NEXT: [[TMP19:%.*]] = icmp ne i64 [[TMP16]], [[TMP17]] -; CHECK-NEXT: [[TMP20:%.*]] = or i1 [[TMP18]], [[TMP19]] -; CHECK-NEXT: br i1 [[TMP20]], label %[[SCALAR_PH:.*]], label %[[BB21:.*]], !prof [[PROF0]] -; CHECK: [[BB21]]: -; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[TMP0]], %[[BB8]] ], [ [[TMP45:%.*]], %[[TMP44:.*]] ] -; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 -; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP22]], i64 [[TMP10]]) -; CHECK-NEXT: [[TMP24:%.*]] = and [[TMP13]], [[TMP23]] -; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[PSEARCH]], i32 1, [[TMP24]], zeroinitializer) -; CHECK-NEXT: br label %[[BB26:.*]] -; CHECK: [[BB26]]: -; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[TMP2]], %[[BB21]] ], [ [[TMP42:%.*]], %[[TMP41:.*]] ] -; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 -; CHECK-NEXT: [[TMP28:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP27]], i64 [[TMP12]]) -; CHECK-NEXT: [[TMP29:%.*]] = and [[TMP13]], [[TMP28]] -; CHECK-NEXT: [[TMP30:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[PNEEDLE]], i32 1, [[TMP29]], zeroinitializer) -; CHECK-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i64 0 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i16 [[TMP31]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP32:%.*]] = select [[TMP29]], [[TMP30]], [[DOTSPLAT]] -; CHECK-NEXT: [[TMP33:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16( [[TMP32]], i64 0) -; CHECK-NEXT: [[TMP34:%.*]] = call @llvm.experimental.vector.match.nxv8i16.v8i16( [[TMP25]], <8 x i16> [[TMP33]], [[TMP24]]) -; CHECK-NEXT: [[TMP35:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1( [[TMP34]]) -; CHECK-NEXT: br i1 [[TMP35]], label %[[BB36:.*]], label %[[TMP41]] -; CHECK: [[BB36]]: -; CHECK-NEXT: [[TMP37:%.*]] = phi ptr [ [[PSEARCH]], %[[BB26]] ] -; CHECK-NEXT: [[TMP38:%.*]] = phi [ 
[[TMP34]], %[[BB26]] ] -; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1( [[TMP38]], i1 true) -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i16, ptr [[TMP37]], i64 [[TMP39]] -; CHECK-NEXT: br label %[[DOTLOOPEXIT:.*]] -; CHECK: [[TMP41]]: -; CHECK-NEXT: [[TMP42]] = getelementptr i16, ptr [[PNEEDLE]], i64 8 -; CHECK-NEXT: [[TMP43:%.*]] = icmp ult ptr [[TMP42]], [[TMP3]] -; CHECK-NEXT: br i1 [[TMP43]], label %[[BB26]], label %[[TMP44]] -; CHECK: [[TMP44]]: -; CHECK-NEXT: [[TMP45]] = getelementptr i16, ptr [[PSEARCH]], i64 8 -; CHECK-NEXT: [[TMP46:%.*]] = icmp ult ptr [[TMP45]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP46]], label %[[BB21]], label %[[DOTLOOPEXIT1:.*]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: br label %[[BB47:.*]] -; CHECK: [[BB47]]: -; CHECK-NEXT: [[TMP48:%.*]] = phi ptr [ [[TMP58:%.*]], %[[TMP57:.*]] ], [ [[TMP0]], %[[SCALAR_PH]] ] -; CHECK-NEXT: [[TMP49:%.*]] = load i16, ptr [[TMP48]], align 1 -; CHECK-NEXT: br label %[[BB53:.*]] -; CHECK: [[BB50:.*]]: -; CHECK-NEXT: [[TMP51:%.*]] = getelementptr inbounds i16, ptr [[TMP54:%.*]], i64 1 -; CHECK-NEXT: [[TMP52:%.*]] = icmp eq ptr [[TMP51]], [[TMP3]] -; CHECK-NEXT: br i1 [[TMP52]], label %[[TMP57]], label %[[BB53]] -; CHECK: [[BB53]]: -; CHECK-NEXT: [[TMP54]] = phi ptr [ [[TMP2]], %[[BB47]] ], [ [[TMP51]], %[[BB50]] ] -; CHECK-NEXT: [[TMP55:%.*]] = load i16, ptr [[TMP54]], align 1 -; CHECK-NEXT: [[TMP56:%.*]] = icmp eq i16 [[TMP49]], [[TMP55]] -; CHECK-NEXT: br i1 [[TMP56]], label %[[DOTLOOPEXIT]], label %[[BB50]] -; CHECK: [[TMP57]]: -; CHECK-NEXT: [[TMP58]] = getelementptr inbounds i16, ptr [[TMP48]], i64 1 -; CHECK-NEXT: [[TMP59:%.*]] = icmp eq ptr [[TMP58]], [[TMP1]] -; CHECK-NEXT: br i1 [[TMP59]], label %[[DOTLOOPEXIT1]], label %[[BB47]] -; CHECK: [[_LOOPEXIT:.*:]] -; CHECK-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP48]], %[[BB53]] ], [ [[TMP40]], %[[BB36]] ] -; CHECK-NEXT: br label %[[BB60]] -; CHECK: [[_LOOPEXIT1:.*:]] -; CHECK-NEXT: br label %[[BB60]] -; CHECK: [[BB60]]: -; 
CHECK-NEXT: [[TMP61:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] -; CHECK-NEXT: ret ptr [[TMP61]] +; CHECK-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]] +; CHECK-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]] +; CHECK-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]] +; CHECK-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]] +; CHECK: [[HEADER_PREHEADER]]: +; CHECK-NEXT: br label %[[MEM_CHECK:.*]] +; CHECK: [[MEM_CHECK]]: +; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64 +; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64 +; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64 +; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 8) +; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12 +; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12 +; CHECK-NEXT: [[NEEDLE_START_PAGE:%.*]] = lshr i64 [[NEEDLE_START_INT]], 12 +; CHECK-NEXT: [[NEEDLE_END_PAGE:%.*]] = lshr i64 [[NEEDLE_END_INT]], 12 +; CHECK-NEXT: [[SEARCH_PAGE_CMP:%.*]] = icmp ne i64 [[SEARCH_START_PAGE]], [[SEARCH_END_PAGE]] +; CHECK-NEXT: [[NEEDLE_PAGE_CMP:%.*]] = icmp ne i64 [[NEEDLE_START_PAGE]], [[NEEDLE_END_PAGE]] +; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]] +; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]] +; CHECK: [[FIND_FIRST_VEC_HEADER]]: +; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [ 
[[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 +; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]]) +; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and [[TMP0]], [[SEARCH_PRED]] +; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[PSEARCH]], i32 1, [[SEARCH_MASKED]], zeroinitializer) +; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]] +; CHECK: [[MATCH_CHECK_VEC]]: +; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 +; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]]) +; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and [[TMP0]], [[NEEDLE_PRED]] +; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call @llvm.masked.load.nxv8i16.p0(ptr [[PNEEDLE]], i32 1, [[NEEDLE_MASKED]], zeroinitializer) +; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement [[NEEDLE_LOAD_VEC]], i64 0 +; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement poison, i16 [[NEEDLE0]], i64 0 +; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector [[NEEDLE0_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select [[NEEDLE_MASKED]], [[NEEDLE_LOAD_VEC]], [[NEEDLE0_SPLAT]] +; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.nxv8i16( [[NEEDLE_SPLAT]], i64 0) +; CHECK-NEXT: [[MATCH_PRED:%.*]] = call @llvm.experimental.vector.match.nxv8i16.v8i16( [[SEARCH_LOAD_VEC]], <8 x i16> [[NEEDLE_VEC]], [[SEARCH_MASKED]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv8i1( [[MATCH_PRED]]) +; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]] +; CHECK: [[CALCULATE_MATCH]]: +; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ] +; CHECK-NEXT: 
[[MATCH_VEC:%.*]] = phi <vscale x 8 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ] +; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv8i1(<vscale x 8 x i1> [[MATCH_VEC]], i1 true) +; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i16, ptr [[MATCH_START]], i64 [[MATCH_IDX]] +; CHECK-NEXT: br label %[[EXIT_LOOPEXIT:.*]] +; CHECK: [[NEEDLE_CHECK_VEC]]: +; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i16, ptr [[PNEEDLE]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]] +; CHECK: [[SEARCH_CHECK_VEC]]: +; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i16, ptr [[PSEARCH]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_LOOPEXIT1:.*]] +; CHECK: [[SCALAR_PREHEADER]]: +; CHECK-NEXT: br label %[[HEADER:.*]] +; CHECK: [[HEADER]]: +; CHECK-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[SCALAR_PREHEADER]] ] +; CHECK-NEXT: [[SEARCH_LOAD:%.*]] = load i16, ptr [[SEARCH_PTR]], align 1 +; CHECK-NEXT: br label %[[MATCH_CHECK:.*]] +; CHECK: [[NEEDLE_CHECK:.*]]: +; CHECK-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i16, ptr [[NEEDLE_PTR:%.*]], i64 1 +; CHECK-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]] +; CHECK-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]] +; CHECK: [[MATCH_CHECK]]: +; CHECK-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ] +; CHECK-NEXT: [[NEEDLE_LOAD:%.*]] = load i16, ptr [[NEEDLE_PTR]], align 1 +; CHECK-NEXT: [[MATCH_CMP:%.*]] = icmp eq i16 [[SEARCH_LOAD]], [[NEEDLE_LOAD]] +; CHECK-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT]], label %[[NEEDLE_CHECK]] +; CHECK: [[SEARCH_CHECK]]: +; CHECK-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i16, ptr
[[SEARCH_PTR]], i64 1 +; CHECK-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]] +; CHECK-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1]], label %[[HEADER]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ], [ [[MATCH_RES]], %[[CALCULATE_MATCH]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT_LOOPEXIT1]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ] +; CHECK-NEXT: ret ptr [[RES]] ; ; DISABLE-LABEL: define ptr @find_first_of_i16( -; DISABLE-SAME: ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], ptr [[TMP2:%.*]], ptr [[TMP3:%.*]]) #[[ATTR0]] { -; DISABLE-NEXT: [[TMP5:%.*]] = icmp eq ptr [[TMP0]], [[TMP1]] -; DISABLE-NEXT: [[TMP6:%.*]] = icmp eq ptr [[TMP2]], [[TMP3]] -; DISABLE-NEXT: [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP6]] -; DISABLE-NEXT: br i1 [[TMP7]], label %[[BB21:.*]], label %[[DOTPREHEADER:.*]] -; DISABLE: [[_PREHEADER:.*:]] -; DISABLE-NEXT: br label %[[BB8:.*]] -; DISABLE: [[BB8]]: -; DISABLE-NEXT: [[TMP9:%.*]] = phi ptr [ [[TMP19:%.*]], %[[TMP18:.*]] ], [ [[TMP0]], %[[DOTPREHEADER]] ] -; DISABLE-NEXT: [[TMP10:%.*]] = load i16, ptr [[TMP9]], align 1 -; DISABLE-NEXT: br label %[[BB14:.*]] -; DISABLE: [[BB11:.*]]: -; DISABLE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP15:%.*]], i64 1 -; DISABLE-NEXT: [[TMP13:%.*]] = icmp eq ptr [[TMP12]], [[TMP3]] -; DISABLE-NEXT: br i1 [[TMP13]], label %[[TMP18]], label %[[BB14]] -; DISABLE: [[BB14]]: -; DISABLE-NEXT: [[TMP15]] = phi ptr [ [[TMP2]], %[[BB8]] ], [ [[TMP12]], %[[BB11]] ] -; DISABLE-NEXT: [[TMP16:%.*]] = load i16, ptr [[TMP15]], align 1 -; DISABLE-NEXT: [[TMP17:%.*]] = icmp eq i16 [[TMP10]], [[TMP16]] -; DISABLE-NEXT: br i1 [[TMP17]], label %[[DOTLOOPEXIT:.*]], label %[[BB11]] -; DISABLE: [[TMP18]]: -; DISABLE-NEXT: [[TMP19]] = getelementptr inbounds 
i16, ptr [[TMP9]], i64 1 -; DISABLE-NEXT: [[TMP20:%.*]] = icmp eq ptr [[TMP19]], [[TMP1]] -; DISABLE-NEXT: br i1 [[TMP20]], label %[[DOTLOOPEXIT1:.*]], label %[[BB8]] -; DISABLE: [[_LOOPEXIT:.*:]] -; DISABLE-NEXT: [[DOTLCSSA:%.*]] = phi ptr [ [[TMP9]], %[[BB14]] ] -; DISABLE-NEXT: br label %[[BB21]] -; DISABLE: [[_LOOPEXIT1:.*:]] -; DISABLE-NEXT: br label %[[BB21]] -; DISABLE: [[BB21]]: -; DISABLE-NEXT: [[TMP22:%.*]] = phi ptr [ [[TMP1]], [[TMP4:%.*]] ], [ [[DOTLCSSA]], %[[DOTLOOPEXIT]] ], [ [[TMP1]], %[[DOTLOOPEXIT1]] ] -; DISABLE-NEXT: ret ptr [[TMP22]] +; DISABLE-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] { +; DISABLE-NEXT: [[ENTRY:.*]]: +; DISABLE-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]] +; DISABLE-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]] +; DISABLE-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]] +; DISABLE-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT:.*]], label %[[HEADER_PREHEADER:.*]] +; DISABLE: [[HEADER_PREHEADER]]: +; DISABLE-NEXT: br label %[[HEADER:.*]] +; DISABLE: [[HEADER]]: +; DISABLE-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[HEADER_PREHEADER]] ] +; DISABLE-NEXT: [[SEARCH_LOAD:%.*]] = load i16, ptr [[SEARCH_PTR]], align 1 +; DISABLE-NEXT: br label %[[MATCH_CHECK:.*]] +; DISABLE: [[NEEDLE_CHECK:.*]]: +; DISABLE-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i16, ptr [[NEEDLE_PTR:%.*]], i64 1 +; DISABLE-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]] +; DISABLE-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]] +; DISABLE: [[MATCH_CHECK]]: +; DISABLE-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ] +; DISABLE-NEXT: [[NEEDLE_LOAD:%.*]] = load i16, ptr [[NEEDLE_PTR]], align 1 +; DISABLE-NEXT: [[MATCH_CMP:%.*]] = icmp eq i16 
[[SEARCH_LOAD]], [[NEEDLE_LOAD]] +; DISABLE-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_LOOPEXIT:.*]], label %[[NEEDLE_CHECK]] +; DISABLE: [[SEARCH_CHECK]]: +; DISABLE-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i16, ptr [[SEARCH_PTR]], i64 1 +; DISABLE-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]] +; DISABLE-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_LOOPEXIT1:.*]], label %[[HEADER]] +; DISABLE: [[EXIT_LOOPEXIT]]: +; DISABLE-NEXT: [[SEARCH_PTR_LCSSA:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ] +; DISABLE-NEXT: br label %[[EXIT]] +; DISABLE: [[EXIT_LOOPEXIT1]]: +; DISABLE-NEXT: br label %[[EXIT]] +; DISABLE: [[EXIT]]: +; DISABLE-NEXT: [[RES:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_PTR_LCSSA]], %[[EXIT_LOOPEXIT]] ], [ [[SEARCH_END]], %[[EXIT_LOOPEXIT1]] ] +; DISABLE-NEXT: ret ptr [[RES]] ; - %5 = icmp eq ptr %0, %1 - %6 = icmp eq ptr %2, %3 - %7 = or i1 %5, %6 - br i1 %7, label %21, label %8 - -8: - %9 = phi ptr [ %19, %18 ], [ %0, %4 ] - %10 = load i16, ptr %9, align 1 - br label %14 - -11: - %12 = getelementptr inbounds i16, ptr %15, i64 1 - %13 = icmp eq ptr %12, %3 - br i1 %13, label %18, label %14 - -14: - %15 = phi ptr [ %2, %8 ], [ %12, %11 ] - %16 = load i16, ptr %15, align 1 - %17 = icmp eq i16 %10, %16 - br i1 %17, label %21, label %11 - -18: - %19 = getelementptr inbounds i16, ptr %9, i64 1 - %20 = icmp eq ptr %19, %1 - br i1 %20, label %21, label %8 - -21: - %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ] - ret ptr %22 +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i16, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i16, ptr %needle_ptr, i64 1 + %needle_cmp 
= icmp eq ptr %needle_next, %needle_end + br i1 %needle_cmp, label %search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i16, ptr %needle_ptr, align 1 + %match_cmp = icmp eq i16 %search_load, %needle_load + br i1 %match_cmp, label %exit, label %needle_check + +search_check: + %search_next = getelementptr inbounds i16, ptr %search_ptr, i64 1 + %search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit, label %header + +exit: + %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ] + ret ptr %res } ; From here on we only test for the presence/absence of the intrinsic. @@ -320,126 +325,128 @@ define ptr @find_first_of_i16(ptr %0, ptr %1, ptr %2, ptr %3) #0 { ; Same as @find_first_of_i8 but with `ne' comparison. ; This is rejected for now, but should eventually be supported. -define ptr @find_first_not_of_i8(ptr %0, ptr %1, ptr %2, ptr %3) #0 { +define ptr @find_first_not_of_i8(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 { ; CHECK-LABEL: define ptr @find_first_not_of_i8( ; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} ; ; DISABLE-LABEL: define ptr @find_first_not_of_i8( ; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} ; - %5 = icmp eq ptr %0, %1 - %6 = icmp eq ptr %2, %3 - %7 = or i1 %5, %6 - br i1 %7, label %21, label %8 - -8: - %9 = phi ptr [ %19, %18 ], [ %0, %4 ] - %10 = load i8, ptr %9, align 1 - br label %14 - -11: - %12 = getelementptr inbounds i8, ptr %15, i64 1 - %13 = icmp eq ptr %12, %3 - br i1 %13, label %18, label %14 - -14: - %15 = phi ptr [ %2, %8 ], [ %12, %11 ] - %16 = load i8, ptr %15, align 1 - %17 = icmp ne i8 %10, %16 - br i1 %17, label %21, label %11 - -18: - %19 = getelementptr inbounds i8, ptr %9, i64 1 - %20 = icmp eq ptr %19, %1 - br i1 %20, label %21, label %8 - -21: - %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 
] - ret ptr %22 +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i8, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1 + %needle_cmp = icmp eq ptr %needle_next, %needle_end + br i1 %needle_cmp, label %search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i8, ptr %needle_ptr, align 1 + %match_cmp = icmp ne i8 %search_load, %needle_load + br i1 %match_cmp, label %exit, label %needle_check + +search_check: + %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1 + %search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit, label %header + +exit: + %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ] + ret ptr %res } ; This is the same as @find_first_of_i8 but without SVE2, which we require to ; perform the conversion. 
-define ptr @find_first_of_i8_nosve2(ptr %0, ptr %1, ptr %2, ptr %3) { +define ptr @find_first_of_i8_nosve2(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) { ; CHECK-LABEL: define ptr @find_first_of_i8_nosve2( ; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} ; ; DISABLE-LABEL: define ptr @find_first_of_i8_nosve2( ; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} ; - %5 = icmp eq ptr %0, %1 - %6 = icmp eq ptr %2, %3 - %7 = or i1 %5, %6 - br i1 %7, label %21, label %8 - -8: - %9 = phi ptr [ %19, %18 ], [ %0, %4 ] - %10 = load i8, ptr %9, align 1 - br label %14 - -11: - %12 = getelementptr inbounds i8, ptr %15, i64 1 - %13 = icmp eq ptr %12, %3 - br i1 %13, label %18, label %14 - -14: - %15 = phi ptr [ %2, %8 ], [ %12, %11 ] - %16 = load i8, ptr %15, align 1 - %17 = icmp eq i8 %10, %16 - br i1 %17, label %21, label %11 - -18: - %19 = getelementptr inbounds i8, ptr %9, i64 1 - %20 = icmp eq ptr %19, %1 - br i1 %20, label %21, label %8 - -21: - %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ] - ret ptr %22 +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i8, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1 + %needle_cmp = icmp eq ptr %needle_next, %needle_end + br i1 %needle_cmp, label %search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i8, ptr %needle_ptr, align 1 + %match_cmp = icmp eq i8 %search_load, %needle_load + br i1 %match_cmp, label %exit, label %needle_check + +search_check: + %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1 + 
%search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit, label %header + +exit: + %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ] + ret ptr %res } ; Same as @find_first_of_i8 but here we use the inner PHI outside the loop nest. ; This isn't supported. -define ptr @find_first_of_i8_outside_use(ptr %0, ptr %1, ptr %2, ptr %3) #0 { +define ptr @find_first_of_i8_outside_use(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 { ; CHECK-LABEL: define ptr @find_first_of_i8_outside_use( ; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} ; ; DISABLE-LABEL: define ptr @find_first_of_i8_outside_use( ; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} ; - %5 = icmp eq ptr %0, %1 - %6 = icmp eq ptr %2, %3 - %7 = or i1 %5, %6 - br i1 %7, label %21, label %8 - -8: - %9 = phi ptr [ %19, %18 ], [ %0, %4 ] - %10 = load i8, ptr %9, align 1 - br label %14 - -11: - %12 = getelementptr inbounds i8, ptr %15, i64 1 - %13 = icmp eq ptr %12, %3 - br i1 %13, label %18, label %14 - -14: - %15 = phi ptr [ %2, %8 ], [ %12, %11 ] - %16 = load i8, ptr %15, align 1 - %17 = icmp ne i8 %10, %16 - br i1 %17, label %21, label %11 - -18: - %19 = getelementptr inbounds i8, ptr %9, i64 1 - %20 = icmp eq ptr %19, %1 - br i1 %20, label %21, label %8 - -21: - %22 = phi ptr [ %1, %4 ], [ %9, %14 ], [ %1, %18 ] - %23 = phi ptr [ %3, %4 ], [ %15, %14 ], [ %3, %18 ] - ret ptr %23 +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i8, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1 + %needle_cmp = icmp eq ptr %needle_next, 
%needle_end + br i1 %needle_cmp, label %search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i8, ptr %needle_ptr, align 1 + %match_cmp = icmp eq i8 %search_load, %needle_load + br i1 %match_cmp, label %exit, label %needle_check + +search_check: + %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1 + %search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit, label %header + +exit: + %res = phi ptr [ %search_end, %entry ], [ %search_ptr, %match_check ], [ %search_end, %search_check ] + %use = phi ptr [ %needle_end, %entry ], [ %needle_ptr, %match_check ], [ %needle_end, %search_check ] + ret ptr %res } attributes #0 = { "target-features"="+sve2" } -;. + ; CHECK: [[PROF0]] = !{!"branch_weights", i32 10, i32 90} -;. From db2fbc69febbb073ab787385b78936c43c313105 Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: Tue, 4 Feb 2025 13:56:25 +0000 Subject: [PATCH 6/7] Add checks for loop invariance --- llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp index 44fe5ba3a0bfd..38eedf92d8c24 100644 --- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp @@ -1158,6 +1158,12 @@ bool LoopIdiomVectorize::recognizeFindFirstByte() { m_BasicBlock(ExitFail), m_Specific(Header)))) return false; + if (!CurLoop->isLoopInvariant(SearchStart) || + !CurLoop->isLoopInvariant(SearchEnd) || + !CurLoop->isLoopInvariant(NeedleStart) || + !CurLoop->isLoopInvariant(NeedleEnd)) + return false; + LLVM_DEBUG(dbgs() << "Found idiom in loop: \n" << *CurLoop << "\n\n"); transformFindFirstByte(IndPhi, VF, CharTy, ExitSucc, ExitFail, SearchStart, From 6d1755189549f3f6dd804d98810cbc374cd087a8 Mon Sep 17 00:00:00 2001 From: Ricardo Jesus Date: 
Thu, 6 Feb 2025 13:02:59 +0000 Subject: [PATCH 7/7] Add tests for different success/failure exits --- .../Vectorize/LoopIdiomVectorize.cpp | 2 +- .../LoopIdiom/AArch64/find-first-byte.ll | 219 ++++++++++++++++++ 2 files changed, 220 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp index 38eedf92d8c24..90329200dd7e4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopIdiomVectorize.cpp @@ -60,7 +60,7 @@ // // NOTE: This Pass matches really specific loop patterns because it's only // supposed to be a temporary solution until our LoopVectorizer is powerful -// enought to vectorize them automatically. +// enough to vectorize them automatically. // //===----------------------------------------------------------------------===// diff --git a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll index 92cde4d27f2c0..8ef2a51506606 100644 --- a/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll +++ b/llvm/test/Transforms/LoopIdiom/AArch64/find-first-byte.ll @@ -320,6 +320,176 @@ exit: ret ptr %res } +; Same as @find_first_of_i8 but with two intermediate exit blocks for the +; "success" (exit_succ) and "failure" (exit_fail) paths. 
+define ptr @find_first_of_i8_multi_exit(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 { +; CHECK-LABEL: define ptr @find_first_of_i8_multi_exit( +; CHECK-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]] +; CHECK-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]] +; CHECK-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]] +; CHECK-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT_FAIL:.*]], label %[[HEADER_PREHEADER:.*]] +; CHECK: [[HEADER_PREHEADER]]: +; CHECK-NEXT: br label %[[MEM_CHECK:.*]] +; CHECK: [[MEM_CHECK]]: +; CHECK-NEXT: [[SEARCH_START_INT:%.*]] = ptrtoint ptr [[SEARCH_START]] to i64 +; CHECK-NEXT: [[SEARCH_END_INT:%.*]] = ptrtoint ptr [[SEARCH_END]] to i64 +; CHECK-NEXT: [[NEEDLE_START_INT:%.*]] = ptrtoint ptr [[NEEDLE_START]] to i64 +; CHECK-NEXT: [[NEEDLE_END_INT:%.*]] = ptrtoint ptr [[NEEDLE_END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 16) +; CHECK-NEXT: [[SEARCH_START_PAGE:%.*]] = lshr i64 [[SEARCH_START_INT]], 12 +; CHECK-NEXT: [[SEARCH_END_PAGE:%.*]] = lshr i64 [[SEARCH_END_INT]], 12 +; CHECK-NEXT: [[NEEDLE_START_PAGE:%.*]] = lshr i64 [[NEEDLE_START_INT]], 12 +; CHECK-NEXT: [[NEEDLE_END_PAGE:%.*]] = lshr i64 [[NEEDLE_END_INT]], 12 +; CHECK-NEXT: [[SEARCH_PAGE_CMP:%.*]] = icmp ne i64 [[SEARCH_START_PAGE]], [[SEARCH_END_PAGE]] +; CHECK-NEXT: [[NEEDLE_PAGE_CMP:%.*]] = icmp ne i64 [[NEEDLE_START_PAGE]], [[NEEDLE_END_PAGE]] +; CHECK-NEXT: [[COMBINED_PAGE_CMP:%.*]] = or i1 [[SEARCH_PAGE_CMP]], [[NEEDLE_PAGE_CMP]] +; CHECK-NEXT: br i1 [[COMBINED_PAGE_CMP]], label %[[SCALAR_PREHEADER:.*]], label %[[FIND_FIRST_VEC_HEADER:.*]], !prof [[PROF0]] +; CHECK: [[FIND_FIRST_VEC_HEADER]]: +; CHECK-NEXT: [[PSEARCH:%.*]] = phi ptr [ [[SEARCH_START]], %[[MEM_CHECK]] ], [
[[SEARCH_NEXT_VEC:%.*]], %[[SEARCH_CHECK_VEC:.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PSEARCH]] to i64 +; CHECK-NEXT: [[SEARCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP1]], i64 [[SEARCH_END_INT]]) +; CHECK-NEXT: [[SEARCH_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[SEARCH_PRED]] +; CHECK-NEXT: [[SEARCH_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PSEARCH]], i32 1, <vscale x 16 x i1> [[SEARCH_MASKED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: br label %[[MATCH_CHECK_VEC:.*]] +; CHECK: [[MATCH_CHECK_VEC]]: +; CHECK-NEXT: [[PNEEDLE:%.*]] = phi ptr [ [[NEEDLE_START]], %[[FIND_FIRST_VEC_HEADER]] ], [ [[NEEDLE_NEXT_VEC:%.*]], %[[NEEDLE_CHECK_VEC:.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[PNEEDLE]] to i64 +; CHECK-NEXT: [[NEEDLE_PRED:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[TMP2]], i64 [[NEEDLE_END_INT]]) +; CHECK-NEXT: [[NEEDLE_MASKED:%.*]] = and <vscale x 16 x i1> [[TMP0]], [[NEEDLE_PRED]] +; CHECK-NEXT: [[NEEDLE_LOAD_VEC:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[PNEEDLE]], i32 1, <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> zeroinitializer) +; CHECK-NEXT: [[NEEDLE0:%.*]] = extractelement <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], i64 0 +; CHECK-NEXT: [[NEEDLE0_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[NEEDLE0]], i64 0 +; CHECK-NEXT: [[NEEDLE0_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[NEEDLE0_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer +; CHECK-NEXT: [[NEEDLE_SPLAT:%.*]] = select <vscale x 16 x i1> [[NEEDLE_MASKED]], <vscale x 16 x i8> [[NEEDLE_LOAD_VEC]], <vscale x 16 x i8> [[NEEDLE0_SPLAT]] +; CHECK-NEXT: [[NEEDLE_VEC:%.*]] = call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> [[NEEDLE_SPLAT]], i64 0) +; CHECK-NEXT: [[MATCH_PRED:%.*]] = call <vscale x 16 x i1> @llvm.experimental.vector.match.nxv16i8.v16i8(<vscale x 16 x i8> [[SEARCH_LOAD_VEC]], <16 x i8> [[NEEDLE_VEC]], <vscale x 16 x i1> [[SEARCH_MASKED]]) +; CHECK-NEXT: [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> [[MATCH_PRED]]) +; CHECK-NEXT: br i1 [[TMP3]], label %[[CALCULATE_MATCH:.*]], label %[[NEEDLE_CHECK_VEC]] +; CHECK: [[CALCULATE_MATCH]]: +; CHECK-NEXT: [[MATCH_START:%.*]] = phi ptr [ [[PSEARCH]], %[[MATCH_CHECK_VEC]] ] +; CHECK-NEXT:
[[MATCH_VEC:%.*]] = phi <vscale x 16 x i1> [ [[MATCH_PRED]], %[[MATCH_CHECK_VEC]] ] +; CHECK-NEXT: [[MATCH_IDX:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[MATCH_VEC]], i1 true) +; CHECK-NEXT: [[MATCH_RES:%.*]] = getelementptr i8, ptr [[MATCH_START]], i64 [[MATCH_IDX]] +; CHECK-NEXT: br label %[[EXIT_SUCC:.*]] +; CHECK: [[NEEDLE_CHECK_VEC]]: +; CHECK-NEXT: [[NEEDLE_NEXT_VEC]] = getelementptr i8, ptr [[PNEEDLE]], i64 16 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult ptr [[NEEDLE_NEXT_VEC]], [[NEEDLE_END]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MATCH_CHECK_VEC]], label %[[SEARCH_CHECK_VEC]] +; CHECK: [[SEARCH_CHECK_VEC]]: +; CHECK-NEXT: [[SEARCH_NEXT_VEC]] = getelementptr i8, ptr [[PSEARCH]], i64 16 +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult ptr [[SEARCH_NEXT_VEC]], [[SEARCH_END]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[FIND_FIRST_VEC_HEADER]], label %[[EXIT_FAIL_LOOPEXIT:.*]] +; CHECK: [[SCALAR_PREHEADER]]: +; CHECK-NEXT: br label %[[HEADER:.*]] +; CHECK: [[HEADER]]: +; CHECK-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[SCALAR_PREHEADER]] ] +; CHECK-NEXT: [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1 +; CHECK-NEXT: br label %[[MATCH_CHECK:.*]] +; CHECK: [[NEEDLE_CHECK:.*]]: +; CHECK-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1 +; CHECK-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]] +; CHECK-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]] +; CHECK: [[MATCH_CHECK]]: +; CHECK-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ] +; CHECK-NEXT: [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1 +; CHECK-NEXT: [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]] +; CHECK-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_SUCC]], label %[[NEEDLE_CHECK]] +; CHECK: [[SEARCH_CHECK]]: +; CHECK-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1
+; CHECK-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]] +; CHECK-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_FAIL_LOOPEXIT]], label %[[HEADER]] +; CHECK: [[EXIT_SUCC]]: +; CHECK-NEXT: [[RES_SUCC:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ], [ [[MATCH_RES]], %[[CALCULATE_MATCH]] ] +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT_FAIL_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT_FAIL]] +; CHECK: [[EXIT_FAIL]]: +; CHECK-NEXT: [[RES_FAIL:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_END]], %[[EXIT_FAIL_LOOPEXIT]] ] +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RES:%.*]] = phi ptr [ [[RES_SUCC]], %[[EXIT_SUCC]] ], [ [[RES_FAIL]], %[[EXIT_FAIL]] ] +; CHECK-NEXT: ret ptr [[RES]] +; +; DISABLE-LABEL: define ptr @find_first_of_i8_multi_exit( +; DISABLE-SAME: ptr [[SEARCH_START:%.*]], ptr [[SEARCH_END:%.*]], ptr [[NEEDLE_START:%.*]], ptr [[NEEDLE_END:%.*]]) #[[ATTR0]] { +; DISABLE-NEXT: [[ENTRY:.*]]: +; DISABLE-NEXT: [[SEARCH_TEST:%.*]] = icmp eq ptr [[SEARCH_START]], [[SEARCH_END]] +; DISABLE-NEXT: [[NEEDLE_TEST:%.*]] = icmp eq ptr [[NEEDLE_START]], [[NEEDLE_END]] +; DISABLE-NEXT: [[COMBINED_TEST:%.*]] = or i1 [[SEARCH_TEST]], [[NEEDLE_TEST]] +; DISABLE-NEXT: br i1 [[COMBINED_TEST]], label %[[EXIT_FAIL:.*]], label %[[HEADER_PREHEADER:.*]] +; DISABLE: [[HEADER_PREHEADER]]: +; DISABLE-NEXT: br label %[[HEADER:.*]] +; DISABLE: [[HEADER]]: +; DISABLE-NEXT: [[SEARCH_PTR:%.*]] = phi ptr [ [[SEARCH_NEXT:%.*]], %[[SEARCH_CHECK:.*]] ], [ [[SEARCH_START]], %[[HEADER_PREHEADER]] ] +; DISABLE-NEXT: [[SEARCH_LOAD:%.*]] = load i8, ptr [[SEARCH_PTR]], align 1 +; DISABLE-NEXT: br label %[[MATCH_CHECK:.*]] +; DISABLE: [[NEEDLE_CHECK:.*]]: +; DISABLE-NEXT: [[NEEDLE_NEXT:%.*]] = getelementptr inbounds i8, ptr [[NEEDLE_PTR:%.*]], i64 1 +; DISABLE-NEXT: [[NEEDLE_CMP:%.*]] = icmp eq ptr [[NEEDLE_NEXT]], [[NEEDLE_END]] +; DISABLE-NEXT: br i1 [[NEEDLE_CMP]], label %[[SEARCH_CHECK]], label %[[MATCH_CHECK]] +; DISABLE: 
[[MATCH_CHECK]]: +; DISABLE-NEXT: [[NEEDLE_PTR]] = phi ptr [ [[NEEDLE_START]], %[[HEADER]] ], [ [[NEEDLE_NEXT]], %[[NEEDLE_CHECK]] ] +; DISABLE-NEXT: [[NEEDLE_LOAD:%.*]] = load i8, ptr [[NEEDLE_PTR]], align 1 +; DISABLE-NEXT: [[MATCH_CMP:%.*]] = icmp eq i8 [[SEARCH_LOAD]], [[NEEDLE_LOAD]] +; DISABLE-NEXT: br i1 [[MATCH_CMP]], label %[[EXIT_SUCC:.*]], label %[[NEEDLE_CHECK]] +; DISABLE: [[SEARCH_CHECK]]: +; DISABLE-NEXT: [[SEARCH_NEXT]] = getelementptr inbounds i8, ptr [[SEARCH_PTR]], i64 1 +; DISABLE-NEXT: [[SEARCH_CMP:%.*]] = icmp eq ptr [[SEARCH_NEXT]], [[SEARCH_END]] +; DISABLE-NEXT: br i1 [[SEARCH_CMP]], label %[[EXIT_FAIL_LOOPEXIT:.*]], label %[[HEADER]] +; DISABLE: [[EXIT_SUCC]]: +; DISABLE-NEXT: [[RES_SUCC:%.*]] = phi ptr [ [[SEARCH_PTR]], %[[MATCH_CHECK]] ] +; DISABLE-NEXT: br label %[[EXIT:.*]] +; DISABLE: [[EXIT_FAIL_LOOPEXIT]]: +; DISABLE-NEXT: br label %[[EXIT_FAIL]] +; DISABLE: [[EXIT_FAIL]]: +; DISABLE-NEXT: [[RES_FAIL:%.*]] = phi ptr [ [[SEARCH_END]], %[[ENTRY]] ], [ [[SEARCH_END]], %[[EXIT_FAIL_LOOPEXIT]] ] +; DISABLE-NEXT: br label %[[EXIT]] +; DISABLE: [[EXIT]]: +; DISABLE-NEXT: [[RES:%.*]] = phi ptr [ [[RES_SUCC]], %[[EXIT_SUCC]] ], [ [[RES_FAIL]], %[[EXIT_FAIL]] ] +; DISABLE-NEXT: ret ptr [[RES]] +; +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit_fail, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i8, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1 + %needle_cmp = icmp eq ptr %needle_next, %needle_end + br i1 %needle_cmp, label %search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i8, ptr %needle_ptr, align 1 + %match_cmp 
= icmp eq i8 %search_load, %needle_load + br i1 %match_cmp, label %exit_succ, label %needle_check + +search_check: + %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1 + %search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit_fail, label %header + +exit_succ: + %res_succ = phi ptr [ %search_ptr, %match_check ] + br label %exit + +exit_fail: + %res_fail = phi ptr [ %search_end, %entry ], [ %search_end, %search_check ] + br label %exit + +exit: + %res = phi ptr [ %res_succ, %exit_succ ], [ %res_fail, %exit_fail ] + ret ptr %res +} + ; From here on we only test for the presence/absence of the intrinsic. ; UTC_ARGS: --disable @@ -447,6 +617,55 @@ exit: ret ptr %res } +; Same as @find_first_of_i8_multi_exit but `search_ptr' is used in `exit_fail' +; which should block the transform. +define ptr @find_first_of_i8_multi_exit_outside_use(ptr %search_start, ptr %search_end, ptr %needle_start, ptr %needle_end) #0 { +; CHECK-LABEL: define ptr @find_first_of_i8_multi_exit_outside_use( +; CHECK-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} +; +; DISABLE-LABEL: define ptr @find_first_of_i8_multi_exit_outside_use( +; DISABLE-NOT: {{%.*}} @llvm.experimental.vector.match{{.*}} +; +entry: + %search_test = icmp eq ptr %search_start, %search_end + %needle_test = icmp eq ptr %needle_start, %needle_end + %combined_test = or i1 %search_test, %needle_test + br i1 %combined_test, label %exit_fail, label %header + +header: + %search_ptr = phi ptr [ %search_next, %search_check ], [ %search_start, %entry ] + %search_load = load i8, ptr %search_ptr, align 1 + br label %match_check + +needle_check: + %needle_next = getelementptr inbounds i8, ptr %needle_ptr, i64 1 + %needle_cmp = icmp eq ptr %needle_next, %needle_end + br i1 %needle_cmp, label %search_check, label %match_check + +match_check: + %needle_ptr = phi ptr [ %needle_start, %header ], [ %needle_next, %needle_check ] + %needle_load = load i8, ptr %needle_ptr, align 1 + %match_cmp = icmp eq 
i8 %search_load, %needle_load + br i1 %match_cmp, label %exit_succ, label %needle_check + +search_check: + %search_next = getelementptr inbounds i8, ptr %search_ptr, i64 1 + %search_cmp = icmp eq ptr %search_next, %search_end + br i1 %search_cmp, label %exit_fail, label %header + +exit_succ: + %res_succ = phi ptr [ %search_ptr, %match_check ] + br label %exit + +exit_fail: + %res_fail = phi ptr [ %search_end, %entry ], [ %search_ptr, %search_check ] + br label %exit + +exit: + %res = phi ptr [ %res_succ, %exit_succ ], [ %res_fail, %exit_fail ] + ret ptr %res +} + attributes #0 = { "target-features"="+sve2" } ; CHECK: [[PROF0]] = !{!"branch_weights", i32 10, i32 90}