diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp index 74f93e1979532..c6f7f850c29fb 100644 --- a/llvm/lib/CodeGen/ExpandMemCmp.cpp +++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp @@ -878,9 +878,22 @@ static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI, NumMemCmpInlined++; if (Value *Res = Expansion.getMemCmpExpansion()) { + auto *GV = dyn_cast(CI->getArgOperand(1)); // Replace call with result of expansion and erase call. CI->replaceAllUsesWith(Res); CI->eraseFromParent(); + + // If the memcmp call used a global constant to merge comparisons and + // the global constant was folded then the variable can be deleted + // since it isn't used anymore. + // This is mostly done when mergeicmps used a global constant to merge + // constant comparisons. + if (GV && GV->hasPrivateLinkage() && GV->isConstant() && + !GV->isConstantUsed()) { + LLVM_DEBUG(dbgs() << "Removing global constant " << GV->getName() + << " that was used by the dead memcmp() call\n"); + GV->eraseFromParent(); + } } return true; diff --git a/llvm/lib/Transforms/Scalar/MergeICmps.cpp b/llvm/lib/Transforms/Scalar/MergeICmps.cpp index 4291f3aee0cd1..0167fdddf7f7f 100644 --- a/llvm/lib/Transforms/Scalar/MergeICmps.cpp +++ b/llvm/lib/Transforms/Scalar/MergeICmps.cpp @@ -43,6 +43,7 @@ #include "llvm/Transforms/Scalar/MergeICmps.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/UniqueVector.h" #include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/Loads.h" @@ -50,8 +51,10 @@ #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Instruction.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/ValueMap.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Transforms/Scalar.h" @@ -129,10 +132,14 @@ class BaseIdentifier { DenseMap BaseToIndex; }; +// 
All Instructions related to a comparison. +typedef SmallDenseSet InstructionSet; + // If this value is a load from a constant offset w.r.t. a base address, and // there are no other users of the load or address, returns the base address and // the offset. -BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) { +BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId, + InstructionSet *BlockInsts) { auto *const LoadI = dyn_cast(Val); if (!LoadI) return {}; @@ -171,43 +178,137 @@ BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) { if (!GEP->accumulateConstantOffset(DL, Offset)) return {}; Base = GEP->getPointerOperand(); + BlockInsts->insert(GEP); } + BlockInsts->insert(LoadI); return BCEAtom(GEP, LoadI, BaseId.getBaseId(Base), Offset); } +// An abstract parent class that can either be a comparison of +// two BCEAtoms with the same offsets to a base pointer (BCECmp) +// or a comparison of a single BCEAtom with a constant (BCEConstCmp). +struct Comparison { +public: + enum CompKind { + CK_ConstCmp, + CK_BceCmp, + }; + +private: + const CompKind Kind; + +public: + int SizeBits; + const ICmpInst *CmpI; + + Comparison(CompKind K, int SizeBits, const ICmpInst *CmpI) + : Kind(K), SizeBits(SizeBits), CmpI(CmpI) {} + CompKind getKind() const { return Kind; } + + virtual ~Comparison() = default; + bool areContiguous(const Comparison &Other) const; + bool operator<(const Comparison &Other) const; +}; + +// A comparison between a BCE atom and an integer constant. +// If these BCE atoms are chained and access adjacent memory then they too can +// be merged, e.g. 
+// ``` +// int *p = ...; +// int a = p[0]; +// int b = p[1]; +// return a == 100 && b == 2; +// ``` +struct BCEConstCmp : public Comparison { + BCEAtom Lhs; + Constant *Const; + + BCEConstCmp(BCEAtom L, Constant *Const, int SizeBits, const ICmpInst *CmpI) + : Comparison(CK_ConstCmp, SizeBits, CmpI), Lhs(std::move(L)), + Const(Const) {} + static bool classof(const Comparison *C) { + return C->getKind() == CK_ConstCmp; + } +}; + // A comparison between two BCE atoms, e.g. `a == o.a` in the example at the // top. // Note: the terminology is misleading: the comparison is symmetric, so there // is no real {l/r}hs. What we want though is to have the same base on the // left (resp. right), so that we can detect consecutive loads. To ensure this // we put the smallest atom on the left. -struct BCECmp { +struct BCECmp : public Comparison { BCEAtom Lhs; BCEAtom Rhs; - int SizeBits; - const ICmpInst *CmpI; BCECmp(BCEAtom L, BCEAtom R, int SizeBits, const ICmpInst *CmpI) - : Lhs(std::move(L)), Rhs(std::move(R)), SizeBits(SizeBits), CmpI(CmpI) { + : Comparison(CK_BceCmp, SizeBits, CmpI), Lhs(std::move(L)), + Rhs(std::move(R)) { if (Rhs < Lhs) std::swap(Rhs, Lhs); } + static bool classof(const Comparison *C) { return C->getKind() == CK_BceCmp; } }; -// A basic block with a comparison between two BCE atoms. -// The block might do extra work besides the atom comparison, in which case -// doesOtherWork() returns true. Under some conditions, the block can be -// split into the atom comparison part and the "other work" part -// (see canSplit()). -class BCECmpBlock { - public: - typedef SmallDenseSet InstructionSet; +// TODO: this can be improved to take alignment into account. 
+bool Comparison::areContiguous(const Comparison &Other) const { + assert(isa(this) == isa(Other) && + "Comparisons are of same kind"); + if (isa(this)) { + const auto &First = cast(this); + const auto &Second = cast(Other); + + return First->Lhs.BaseId == Second.Lhs.BaseId && + First->Lhs.Offset + First->SizeBits / 8 == Second.Lhs.Offset; + } + const auto &First = cast(this); + const auto &Second = cast(Other); + + return First->Lhs.BaseId == Second.Lhs.BaseId && + First->Rhs.BaseId == Second.Rhs.BaseId && + First->Lhs.Offset + First->SizeBits / 8 == Second.Lhs.Offset && + First->Rhs.Offset + First->SizeBits / 8 == Second.Rhs.Offset; +} +bool Comparison::operator<(const Comparison &Other) const { + assert(isa(this) == isa(Other) && + "Comparisons are of same kind"); + if (isa(this)) { + const auto &First = cast(this); + const auto &Second = cast(Other); + return First->Lhs < Second.Lhs; + } + const auto &First = cast(this); + const auto &Second = cast(Other); + return std::tie(First->Lhs, First->Rhs) < std::tie(Second.Lhs, Second.Rhs); +} + +// Represents multiple comparisons inside of a single basic block. +// This happens if multiple basic blocks have previously been merged into a +// single block using a select node. +class IntraCmpChain { + // TODO: this could probably be a unique-ptr but current impl relies on some + // copies + std::vector> CmpChain; + +public: + IntraCmpChain(std::shared_ptr C) : CmpChain{C} {} + IntraCmpChain combine(const IntraCmpChain OtherChain) { + CmpChain.insert(CmpChain.end(), OtherChain.CmpChain.begin(), + OtherChain.CmpChain.end()); + return *this; + } + std::vector> getCmpChain() const { + return CmpChain; + } +}; - BCECmpBlock(BCECmp Cmp, BasicBlock *BB, InstructionSet BlockInsts) - : BB(BB), BlockInsts(std::move(BlockInsts)), Cmp(std::move(Cmp)) {} +// A basic block that contains one or more comparisons. 
+class MultBCECmpBlock { +public: + MultBCECmpBlock(std::vector> Cmps, BasicBlock *BB, + InstructionSet BlockInsts) + : BB(BB), BlockInsts(std::move(BlockInsts)), Cmps(std::move(Cmps)) {} - const BCEAtom &Lhs() const { return Cmp.Lhs; } - const BCEAtom &Rhs() const { return Cmp.Rhs; } - int SizeBits() const { return Cmp.SizeBits; } + std::vector> getCmps() { return Cmps; } // Returns true if the block does other works besides comparison. bool doesOtherWork() const; @@ -216,12 +317,50 @@ class BCECmpBlock { // instructions in the block. bool canSplit(AliasAnalysis &AA) const; - // Return true if this all the relevant instructions in the BCE-cmp-block can + // Return true if all the relevant instructions in the BCE-cmp-block can // be sunk below this instruction. By doing this, we know we can separate the // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the // block. bool canSinkBCECmpInst(const Instruction *, AliasAnalysis &AA) const; + // Returns all instructions that should be split off of the comparison chain. + llvm::SmallVector getAllSplitInsts(AliasAnalysis &AA) const; + + // The basic block where this comparison happens. + BasicBlock *BB; + // Instructions relating to the BCECmp and branch. + InstructionSet BlockInsts; + +private: + std::vector> Cmps; +}; + +// A basic block with a single comparison between two BCE atoms. +// The block might do extra work besides the atom comparison, in which case +// doesOtherWork() returns true. Under some conditions, the block can be +// split into the atom comparison part and the "other work" part +// (see canSplit()). 
+class SingleBCECmpBlock { +public: + SingleBCECmpBlock(std::shared_ptr Cmp, BasicBlock *BB, + unsigned OrigOrder) + : BB(BB), OrigOrder(OrigOrder), Cmp(std::move(Cmp)) {} + + SingleBCECmpBlock(std::shared_ptr Cmp, BasicBlock *BB, + unsigned OrigOrder, + llvm::SmallVector SplitInsts) + : BB(BB), OrigOrder(OrigOrder), RequireSplit(true), Cmp(std::move(Cmp)), + SplitInsts(SplitInsts) {} + + const BCEAtom *Lhs() const { + if (auto *const BceConstCmp = dyn_cast(Cmp.get())) + return &BceConstCmp->Lhs; + auto *const BceCmp = cast(Cmp.get()); + return &BceCmp->Lhs; + } + const Comparison *getCmp() const { return Cmp.get(); } + bool operator<(const SingleBCECmpBlock &O) const { return *Cmp < *O.Cmp; } + // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block // instructions. Split the old block and move all non-BCE-cmp-insts into the // new parent block. @@ -229,19 +368,18 @@ class BCECmpBlock { // The basic block where this comparison happens. BasicBlock *BB; - // Instructions relating to the BCECmp and branch. - InstructionSet BlockInsts; - // The block requires splitting. - bool RequireSplit = false; // Original order of this block in the chain. unsigned OrigOrder = 0; + // The block requires splitting. + bool RequireSplit = false; private: - BCECmp Cmp; + std::shared_ptr Cmp; + llvm::SmallVector SplitInsts; }; -bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst, - AliasAnalysis &AA) const { +bool MultBCECmpBlock::canSinkBCECmpInst(const Instruction *Inst, + AliasAnalysis &AA) const { // If this instruction may clobber the loads and is in middle of the BCE cmp // block instructions, then bail for now. 
if (Inst->mayWriteToMemory()) { @@ -251,7 +389,13 @@ bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst, return (Inst->getParent() != LI->getParent() || !Inst->comesBefore(LI)) && isModSet(AA.getModRefInfo(Inst, MemoryLocation::get(LI))); }; - if (MayClobber(Cmp.Lhs.LoadI) || MayClobber(Cmp.Rhs.LoadI)) + auto CmpLoadsAreClobbered = [&](const auto &Cmp) { + if (auto *const BceConstCmp = dyn_cast(Cmp.get())) + return MayClobber(BceConstCmp->Lhs.LoadI); + auto *const BceCmp = cast(Cmp.get()); + return MayClobber(BceCmp->Lhs.LoadI) || MayClobber(BceCmp->Rhs.LoadI); + }; + if (llvm::any_of(Cmps, CmpLoadsAreClobbered)) return false; } // Make sure this instruction does not use any of the BCE cmp block @@ -262,23 +406,13 @@ bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst, }); } -void BCECmpBlock::split(BasicBlock *NewParent, AliasAnalysis &AA) const { - llvm::SmallVector OtherInsts; - for (Instruction &Inst : *BB) { - if (BlockInsts.count(&Inst)) - continue; - assert(canSinkBCECmpInst(&Inst, AA) && "Split unsplittable block"); - // This is a non-BCE-cmp-block instruction. And it can be separated - // from the BCE-cmp-block instruction. - OtherInsts.push_back(&Inst); - } - - // Do the actual spliting. - for (Instruction *Inst : reverse(OtherInsts)) +void SingleBCECmpBlock::split(BasicBlock *NewParent, AliasAnalysis &AA) const { + // Do the actual splitting. + for (Instruction *Inst : reverse(SplitInsts)) Inst->moveBeforePreserving(*NewParent, NewParent->begin()); } -bool BCECmpBlock::canSplit(AliasAnalysis &AA) const { +bool MultBCECmpBlock::canSplit(AliasAnalysis &AA) const { for (Instruction &Inst : *BB) { if (!BlockInsts.count(&Inst)) { if (!canSinkBCECmpInst(&Inst, AA)) @@ -288,7 +422,7 @@ bool BCECmpBlock::canSplit(AliasAnalysis &AA) const { return true; } -bool BCECmpBlock::doesOtherWork() const { +bool MultBCECmpBlock::doesOtherWork() const { // TODO(courbet): Can we allow some other things ? This is very conservative. 
// We might be able to get away with anything does not have any side // effects outside of the basic block. @@ -300,11 +434,26 @@ bool BCECmpBlock::doesOtherWork() const { return false; } +llvm::SmallVector +MultBCECmpBlock::getAllSplitInsts(AliasAnalysis &AA) const { + llvm::SmallVector SplitInsts; + for (Instruction &Inst : *BB) { + if (BlockInsts.count(&Inst)) + continue; + assert(canSinkBCECmpInst(&Inst, AA) && "Split unsplittable block"); + // This is a non-BCE-cmp-block instruction. And it can be separated + // from the BCE-cmp-block instructions. + SplitInsts.push_back(&Inst); + } + return SplitInsts; +} + // Visit the given comparison. If this is a comparison between two valid -// BCE atoms, returns the comparison. -std::optional visitICmp(const ICmpInst *const CmpI, - const ICmpInst::Predicate ExpectedPredicate, - BaseIdentifier &BaseId) { +// BCE atoms, or between a BCE atom and a constant, returns the comparison. +std::optional> +visitICmp(const ICmpInst *const CmpI, + const ICmpInst::Predicate ExpectedPredicate, BaseIdentifier &BaseId, + InstructionSet *BlockInsts) { // The comparison can only be used once: // - For intermediate blocks, as a branch condition. // - For the final block, as an incoming value for the Phi. @@ -319,23 +468,84 @@ std::optional visitICmp(const ICmpInst *const CmpI, LLVM_DEBUG(dbgs() << "cmp " << (ExpectedPredicate == ICmpInst::ICMP_EQ ? 
"eq" : "ne") << "\n"); - auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0), BaseId); + // First operand is always a load + auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0), BaseId, BlockInsts); if (!Lhs.BaseId) return std::nullopt; - auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1), BaseId); + + // Second operand can either be load if doing compare between two BCE atoms or + // can be constant if comparing adjacent memory to constant + auto *RhsOperand = CmpI->getOperand(1); + const auto &DL = CmpI->getDataLayout(); + int SizeBits = DL.getTypeSizeInBits(CmpI->getOperand(0)->getType()); + + BlockInsts->insert(CmpI); + if (auto const &Const = dyn_cast(RhsOperand)) + return std::make_shared( + BCEConstCmp(std::move(Lhs), Const, SizeBits, CmpI)); + + auto Rhs = visitICmpLoadOperand(RhsOperand, BaseId, BlockInsts); if (!Rhs.BaseId) return std::nullopt; - const auto &DL = CmpI->getDataLayout(); - return BCECmp(std::move(Lhs), std::move(Rhs), - DL.getTypeSizeInBits(CmpI->getOperand(0)->getType()), CmpI); + return std::make_shared( + BCECmp(std::move(Lhs), std::move(Rhs), SizeBits, CmpI)); +} + +// Chain of comparisons inside a single basic block connected using `select` +// nodes. 
+std::optional visitComparison(Value *, ICmpInst::Predicate, + BaseIdentifier &, + InstructionSet *); + +std::optional visitSelect(const SelectInst *const SelectI, + ICmpInst::Predicate ExpectedPredicate, + BaseIdentifier &BaseId, + InstructionSet *BlockInsts) { + if (!SelectI->hasOneUse()) { + LLVM_DEBUG(dbgs() << "select has several uses\n"); + return std::nullopt; + } + auto *Cmp1 = dyn_cast(SelectI->getOperand(0)); + auto *Sel1 = dyn_cast(SelectI->getOperand(0)); + auto const &Cmp2 = dyn_cast(SelectI->getOperand(1)); + auto const &ConstantI = dyn_cast(SelectI->getOperand(2)); + + if (!(Cmp1 || Sel1) || !Cmp2 || !ConstantI || !ConstantI->isZeroValue()) + return std::nullopt; + + auto Lhs = visitComparison(SelectI->getOperand(0), ExpectedPredicate, BaseId, + BlockInsts); + if (!Lhs) + return std::nullopt; + auto Rhs = visitComparison(Cmp2, ExpectedPredicate, BaseId, BlockInsts); + if (!Rhs) + return std::nullopt; + + BlockInsts->insert(SelectI); + return Lhs->combine(std::move(*Rhs)); +} + +std::optional +visitComparison(Value *Cond, ICmpInst::Predicate ExpectedPredicate, + BaseIdentifier &BaseId, InstructionSet *BlockInsts) { + if (auto *CmpI = dyn_cast(Cond)) { + auto CmpVisit = visitICmp(CmpI, ExpectedPredicate, BaseId, BlockInsts); + if (!CmpVisit) + return std::nullopt; + return IntraCmpChain(*CmpVisit); + } + if (auto *SelectI = dyn_cast(Cond)) + return visitSelect(SelectI, ExpectedPredicate, BaseId, BlockInsts); + + return std::nullopt; } // Visit the given comparison block. If this is a comparison between two valid // BCE atoms, returns the comparison. 
-std::optional visitCmpBlock(Value *const Val, - BasicBlock *const Block, - const BasicBlock *const PhiBlock, - BaseIdentifier &BaseId) { +std::optional visitCmpBlock(Value *const Val, + BasicBlock *const Block, + const BasicBlock *const PhiBlock, + BaseIdentifier &BaseId) { if (Block->empty()) return std::nullopt; auto *const BranchI = dyn_cast(Block->getTerminator()); @@ -366,41 +576,56 @@ std::optional visitCmpBlock(Value *const Val, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE; } - auto *CmpI = dyn_cast(Cond); - if (!CmpI) - return std::nullopt; - LLVM_DEBUG(dbgs() << "icmp\n"); - - std::optional Result = visitICmp(CmpI, ExpectedPredicate, BaseId); + InstructionSet BlockInsts; + std::optional Result = + visitComparison(Cond, ExpectedPredicate, BaseId, &BlockInsts); if (!Result) return std::nullopt; - BCECmpBlock::InstructionSet BlockInsts( - {Result->Lhs.LoadI, Result->Rhs.LoadI, Result->CmpI, BranchI}); - if (Result->Lhs.GEP) - BlockInsts.insert(Result->Lhs.GEP); - if (Result->Rhs.GEP) - BlockInsts.insert(Result->Rhs.GEP); - return BCECmpBlock(std::move(*Result), Block, BlockInsts); + BlockInsts.insert(BranchI); + return MultBCECmpBlock(Result->getCmpChain(), Block, BlockInsts); } -static inline void enqueueBlock(std::vector &Comparisons, - BCECmpBlock &&Comparison) { - LLVM_DEBUG(dbgs() << "Block '" << Comparison.BB->getName() - << "': Found cmp of " << Comparison.SizeBits() - << " bits between " << Comparison.Lhs().BaseId << " + " - << Comparison.Lhs().Offset << " and " - << Comparison.Rhs().BaseId << " + " - << Comparison.Rhs().Offset << "\n"); - LLVM_DEBUG(dbgs() << "\n"); - Comparison.OrigOrder = Comparisons.size(); - Comparisons.push_back(std::move(Comparison)); +void emitDebugInfo(std::shared_ptr Cmp, BasicBlock *BB) { + LLVM_DEBUG(dbgs() << "Block '" << BB->getName()); + if (auto *ConstCmp = dyn_cast(Cmp.get())) { + LLVM_DEBUG(dbgs() << "': Found constant-cmp of " << Cmp->SizeBits + << " bits including " << ConstCmp->Lhs.BaseId << " + 
" + << ConstCmp->Lhs.Offset << "\n"); + return; + } + auto *BceCmp = cast(Cmp.get()); + LLVM_DEBUG(dbgs() << "': Found cmp of " << BceCmp->SizeBits + << " bits between " << BceCmp->Lhs.BaseId << " + " + << BceCmp->Lhs.Offset << " and " << BceCmp->Rhs.BaseId + << " + " << BceCmp->Rhs.Offset << "\n"); +} + +// Enqueues all comparisons of a mult-block. +// If the block requires splitting then adds `OtherInsts` to the block too. +static inline void +enqueueSingleCmps(std::vector &Comparisons, + MultBCECmpBlock &&CmpBlock, AliasAnalysis &AA, + bool RequireSplit) { + bool hasAlreadySplit = false; + for (auto &Cmp : CmpBlock.getCmps()) { + emitDebugInfo(Cmp, CmpBlock.BB); + unsigned OrigOrder = Comparisons.size(); + if (RequireSplit && !hasAlreadySplit) { + hasAlreadySplit = true; + auto SplitInsts = CmpBlock.getAllSplitInsts(AA); + Comparisons.push_back( + SingleBCECmpBlock(Cmp, CmpBlock.BB, OrigOrder, SplitInsts)); + continue; + } + Comparisons.push_back(SingleBCECmpBlock(Cmp, CmpBlock.BB, OrigOrder)); + } } // A chain of comparisons. class BCECmpChain { public: - using ContiguousBlocks = std::vector; + using ContiguousBlocks = std::vector; BCECmpChain(const std::vector &Blocks, PHINode &Phi, AliasAnalysis &AA); @@ -408,66 +633,83 @@ class BCECmpChain { bool simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA, DomTreeUpdater &DTU); + bool multBlockOnlyPartiallyMerged(); + bool atLeastOneMerged() const { return any_of(MergedBlocks_, [](const auto &Blocks) { return Blocks.size() > 1; }); - } + }; private: PHINode &Phi_; // The list of all blocks in the chain, grouped by contiguity. + // First all BCE comparisons followed by all BCE-Const comparisons. 
std::vector MergedBlocks_; // The original entry block (before sorting); BasicBlock *EntryBlock_; }; -static bool areContiguous(const BCECmpBlock &First, const BCECmpBlock &Second) { - return First.Lhs().BaseId == Second.Lhs().BaseId && - First.Rhs().BaseId == Second.Rhs().BaseId && - First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset && - First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset; +// Returns true if a merge in the chain depends on a basic block where not every +// comparison is merged. NOTE: This is pretty restrictive and could potentially +// be handled using an improved tradeoff heuristic. +bool BCECmpChain::multBlockOnlyPartiallyMerged() { + llvm::SmallDenseSet UnmergedBlocks, MergedBB; + + for (auto &Merged : MergedBlocks_) { + if (Merged.size() == 1) { + UnmergedBlocks.insert(Merged[0].BB); + continue; + } + for (auto &C : Merged) + MergedBB.insert(C.BB); + } + return llvm::any_of(MergedBB, [&](const BasicBlock *BB) { + return UnmergedBlocks.contains(BB); + }); } static unsigned getMinOrigOrder(const BCECmpChain::ContiguousBlocks &Blocks) { unsigned MinOrigOrder = std::numeric_limits::max(); - for (const BCECmpBlock &Block : Blocks) + for (const SingleBCECmpBlock &Block : Blocks) MinOrigOrder = std::min(MinOrigOrder, Block.OrigOrder); return MinOrigOrder; } -/// Given a chain of comparison blocks, groups the blocks into contiguous -/// ranges that can be merged together into a single comparison. -static std::vector -mergeBlocks(std::vector &&Blocks) { - std::vector MergedBlocks; - +/// Given a chain of comparison blocks (of the same kind), groups the blocks +/// into contiguous ranges that can be merged together into a single comparison. +template +static void +mergeBlocks(RandomIt First, RandomIt Last, + std::vector *MergedBlocks) { // Sort to detect continuous offsets. 
- llvm::sort(Blocks, - [](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) { - return std::tie(LhsBlock.Lhs(), LhsBlock.Rhs()) < - std::tie(RhsBlock.Lhs(), RhsBlock.Rhs()); - }); + llvm::sort( + First, Last, + [](const SingleBCECmpBlock &LhsBlock, const SingleBCECmpBlock &RhsBlock) { + return LhsBlock < RhsBlock; + }); BCECmpChain::ContiguousBlocks *LastMergedBlock = nullptr; - for (BCECmpBlock &Block : Blocks) { - if (!LastMergedBlock || !areContiguous(LastMergedBlock->back(), Block)) { - MergedBlocks.emplace_back(); - LastMergedBlock = &MergedBlocks.back(); + int Offset = MergedBlocks->size(); + for (auto &BlockIt = First; BlockIt != Last; ++BlockIt) { + if (!LastMergedBlock || + !LastMergedBlock->back().getCmp()->areContiguous(*BlockIt->getCmp())) { + MergedBlocks->emplace_back(); + LastMergedBlock = &MergedBlocks->back(); } else { - LLVM_DEBUG(dbgs() << "Merging block " << Block.BB->getName() << " into " - << LastMergedBlock->back().BB->getName() << "\n"); + LLVM_DEBUG(dbgs() << "Merging block " << BlockIt->BB->getName() + << " into " << LastMergedBlock->back().BB->getName() + << "\n"); } - LastMergedBlock->push_back(std::move(Block)); + LastMergedBlock->push_back(std::move(*BlockIt)); } // While we allow reordering for merging, do not reorder unmerged comparisons. // Doing so may introduce branch on poison. 
- llvm::sort(MergedBlocks, [](const BCECmpChain::ContiguousBlocks &LhsBlocks, - const BCECmpChain::ContiguousBlocks &RhsBlocks) { - return getMinOrigOrder(LhsBlocks) < getMinOrigOrder(RhsBlocks); - }); - - return MergedBlocks; + llvm::sort(MergedBlocks->begin() + Offset, MergedBlocks->end(), + [](const BCECmpChain::ContiguousBlocks &LhsBlocks, + const BCECmpChain::ContiguousBlocks &RhsBlocks) { + return getMinOrigOrder(LhsBlocks) < getMinOrigOrder(RhsBlocks); + }); } BCECmpChain::BCECmpChain(const std::vector &Blocks, PHINode &Phi, @@ -475,18 +717,18 @@ BCECmpChain::BCECmpChain(const std::vector &Blocks, PHINode &Phi, : Phi_(Phi) { assert(!Blocks.empty() && "a chain should have at least one block"); // Now look inside blocks to check for BCE comparisons. - std::vector Comparisons; + std::vector Comparisons; BaseIdentifier BaseId; for (BasicBlock *const Block : Blocks) { assert(Block && "invalid block"); - std::optional Comparison = visitCmpBlock( + std::optional CmpBlock = visitCmpBlock( Phi.getIncomingValueForBlock(Block), Block, Phi.getParent(), BaseId); - if (!Comparison) { + if (!CmpBlock) { LLVM_DEBUG(dbgs() << "chain with invalid BCECmpBlock, no merge.\n"); return; } - if (Comparison->doesOtherWork()) { - LLVM_DEBUG(dbgs() << "block '" << Comparison->BB->getName() + if (CmpBlock->doesOtherWork()) { + LLVM_DEBUG(dbgs() << "block '" << CmpBlock->BB->getName() << "' does extra work besides compare\n"); if (Comparisons.empty()) { // This is the initial block in the chain, in case this block does other @@ -502,15 +744,14 @@ BCECmpChain::BCECmpChain(const std::vector &Blocks, PHINode &Phi, // and start anew. // // NOTE: we only handle blocks a with single predecessor for now. 
- if (Comparison->canSplit(AA)) { + if (CmpBlock->canSplit(AA)) { LLVM_DEBUG(dbgs() - << "Split initial block '" << Comparison->BB->getName() + << "Split initial block '" << CmpBlock->BB->getName() << "' that does extra work besides compare\n"); - Comparison->RequireSplit = true; - enqueueBlock(Comparisons, std::move(*Comparison)); + enqueueSingleCmps(Comparisons, std::move(*CmpBlock), AA, true); } else { LLVM_DEBUG(dbgs() - << "ignoring initial block '" << Comparison->BB->getName() + << "ignoring initial block '" << CmpBlock->BB->getName() << "' that does extra work besides compare\n"); } continue; @@ -540,7 +781,7 @@ BCECmpChain::BCECmpChain(const std::vector &Blocks, PHINode &Phi, // We could still merge bb1 and bb2 though. return; } - enqueueBlock(Comparisons, std::move(*Comparison)); + enqueueSingleCmps(Comparisons, std::move(*CmpBlock), AA, false); } // It is possible we have no suitable comparison to merge. @@ -548,8 +789,26 @@ BCECmpChain::BCECmpChain(const std::vector &Blocks, PHINode &Phi, LLVM_DEBUG(dbgs() << "chain with no BCE basic blocks, no merge\n"); return; } + EntryBlock_ = Comparisons[0].BB; - MergedBlocks_ = mergeBlocks(std::move(Comparisons)); + + auto isConstCmp = [](SingleBCECmpBlock &C) { + return isa(C.getCmp()); + }; + auto BceIt = + std::partition(Comparisons.begin(), Comparisons.end(), isConstCmp); + + // The chain that requires splitting should always be first. + // If no chain requires splitting then defaults to BCE-comparisons coming + // first. 
+ if (std::any_of(Comparisons.begin(), BceIt, + [](const SingleBCECmpBlock &B) { return B.RequireSplit; })) { + mergeBlocks(Comparisons.begin(), BceIt, &MergedBlocks_); + mergeBlocks(BceIt, Comparisons.end(), &MergedBlocks_); + } else { + mergeBlocks(BceIt, Comparisons.end(), &MergedBlocks_); + mergeBlocks(Comparisons.begin(), BceIt, &MergedBlocks_); + } } namespace { @@ -561,20 +820,24 @@ class MergedBlockName { SmallString<16> Scratch; public: - explicit MergedBlockName(ArrayRef Comparisons) + explicit MergedBlockName(ArrayRef Comparisons) : Name(makeName(Comparisons)) {} const StringRef Name; private: - StringRef makeName(ArrayRef Comparisons) { + StringRef makeName(ArrayRef Comparisons) { assert(!Comparisons.empty() && "no basic block"); // Fast path: only one block, or no names at all. if (Comparisons.size() == 1) return Comparisons[0].BB->getName(); - const int size = std::accumulate(Comparisons.begin(), Comparisons.end(), 0, - [](int i, const BCECmpBlock &Cmp) { - return i + Cmp.BB->getName().size(); - }); + // Since multiple comparisons can come from the same basic block + // (when using select inst) don't want to repeat same name twice + UniqueVector UniqueNames; + for (const auto &B : Comparisons) + UniqueNames.insert(B.BB->getName()); + const int size = std::accumulate( + UniqueNames.begin(), UniqueNames.end(), 0, + [](int i, const StringRef &Name) { return i + Name.size(); }); if (size == 0) return StringRef("", 0); @@ -582,16 +845,17 @@ class MergedBlockName { Scratch.clear(); // We'll have `size` bytes for name and `Comparisons.size() - 1` bytes for // separators. 
- Scratch.reserve(size + Comparisons.size() - 1); + Scratch.reserve(size + UniqueNames.size() - 1); const auto append = [this](StringRef str) { Scratch.append(str.begin(), str.end()); }; - append(Comparisons[0].BB->getName()); - for (int I = 1, E = Comparisons.size(); I < E; ++I) { - const BasicBlock *const BB = Comparisons[I].BB; - if (!BB->getName().empty()) { + // UniqueVector's index starts at 1 + append(UniqueNames[1]); + for (int I = 2, E = UniqueNames.size(); I <= E; ++I) { + StringRef BBName = UniqueNames[I]; + if (!BBName.empty()) { append("+"); - append(BB->getName()); + append(BBName); } } return Scratch.str(); @@ -599,15 +863,59 @@ class MergedBlockName { }; } // namespace +// Add a branch to the next basic block in the chain. +void updateBranching(Value *CondResult, IRBuilder<> &Builder, BasicBlock *BB, + BasicBlock *const NextCmpBlock, PHINode &Phi, + LLVMContext &Context, const TargetLibraryInfo &TLI, + AliasAnalysis &AA, DomTreeUpdater &DTU) { + BasicBlock *const PhiBB = Phi.getParent(); + if (NextCmpBlock == PhiBB) { + // Continue to phi, passing it the comparison result. + Builder.CreateBr(PhiBB); + Phi.addIncoming(CondResult, BB); + DTU.applyUpdates({{DominatorTree::Insert, BB, PhiBB}}); + } else { + // Continue to next block if equal, exit to phi else. + Builder.CreateCondBr(CondResult, NextCmpBlock, PhiBB); + Phi.addIncoming(ConstantInt::getFalse(Context), BB); + DTU.applyUpdates({{DominatorTree::Insert, BB, NextCmpBlock}, + {DominatorTree::Insert, BB, PhiBB}}); + } +} + +// Builds global constant-struct to compare to pointer during memcmp(). +// Has to be global in order for expand-memcmp pass to be able to fold +// constants. 
+GlobalVariable *buildConstantStruct(ArrayRef &Comparisons, + IRBuilder<> &Builder, LLVMContext &Context, + Module &M) { + std::vector Constants; + std::vector Types; + + for (const auto &BceBlock : Comparisons) { + assert(isa(BceBlock.getCmp()) && + "Const-cmp-chain can only contain const comparisons"); + auto *ConstCmp = cast(BceBlock.getCmp()); + Constants.emplace_back(ConstCmp->Const); + Types.emplace_back(ConstCmp->Lhs.LoadI->getType()); + } + auto *StructType = StructType::get( + Context, Types, /* currently only matches packed offsets */ true); + auto *StructConstant = ConstantStruct::get(StructType, Constants); + + return new GlobalVariable(M, StructType, true, GlobalVariable::PrivateLinkage, + StructConstant, "memcmp_const_op"); +} + // Merges the given contiguous comparison blocks into one memcmp block. -static BasicBlock *mergeComparisons(ArrayRef Comparisons, +static BasicBlock *mergeComparisons(ArrayRef Comparisons, BasicBlock *const InsertBefore, BasicBlock *const NextCmpBlock, - PHINode &Phi, const TargetLibraryInfo &TLI, + PHINode &Phi, LLVMContext &Context, + const TargetLibraryInfo &TLI, AliasAnalysis &AA, DomTreeUpdater &DTU) { - assert(!Comparisons.empty() && "merging zero comparisons"); - LLVMContext &Context = NextCmpBlock->getContext(); - const BCECmpBlock &FirstCmp = Comparisons[0]; + assert(Comparisons.size() > 1 && "merging multiple comparisons"); + const SingleBCECmpBlock &FirstCmp = Comparisons[0]; // Create a new cmp block before next cmp block. BasicBlock *const BB = @@ -616,74 +924,81 @@ static BasicBlock *mergeComparisons(ArrayRef Comparisons, IRBuilder<> Builder(BB); // Add the GEPs from the first BCECmpBlock. 
Value *Lhs, *Rhs; - if (FirstCmp.Lhs().GEP) - Lhs = Builder.Insert(FirstCmp.Lhs().GEP->clone()); + if (FirstCmp.Lhs()->GEP) + Lhs = Builder.Insert(FirstCmp.Lhs()->GEP->clone()); else - Lhs = FirstCmp.Lhs().LoadI->getPointerOperand(); - if (FirstCmp.Rhs().GEP) - Rhs = Builder.Insert(FirstCmp.Rhs().GEP->clone()); - else - Rhs = FirstCmp.Rhs().LoadI->getPointerOperand(); + Lhs = FirstCmp.Lhs()->LoadI->getPointerOperand(); - Value *IsEqual = nullptr; + if (isa(FirstCmp.getCmp())) { + Rhs = buildConstantStruct(Comparisons, Builder, Context, *Phi.getModule()); + } else { + auto *FirstBceCmp = cast(FirstCmp.getCmp()); + if (FirstBceCmp->Rhs.GEP) + Rhs = Builder.Insert(FirstBceCmp->Rhs.GEP->clone()); + else + Rhs = FirstBceCmp->Rhs.LoadI->getPointerOperand(); + } LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons -> " << BB->getName() << "\n"); // If there is one block that requires splitting, we do it now, i.e. // just before we know we will collapse the chain. The instructions // can be executed before any of the instructions in the chain. - const auto ToSplit = llvm::find_if( - Comparisons, [](const BCECmpBlock &B) { return B.RequireSplit; }); + const auto *ToSplit = llvm::find_if( + Comparisons, [](const SingleBCECmpBlock &B) { return B.RequireSplit; }); if (ToSplit != Comparisons.end()) { LLVM_DEBUG(dbgs() << "Splitting non_BCE work to header\n"); ToSplit->split(BB, AA); } - if (Comparisons.size() == 1) { - LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n"); - // Use clone to keep the metadata - Instruction *const LhsLoad = Builder.Insert(FirstCmp.Lhs().LoadI->clone()); - Instruction *const RhsLoad = Builder.Insert(FirstCmp.Rhs().LoadI->clone()); - LhsLoad->replaceUsesOfWith(LhsLoad->getOperand(0), Lhs); - RhsLoad->replaceUsesOfWith(RhsLoad->getOperand(0), Rhs); - // There are no blocks to merge, just do the comparison. 
- IsEqual = Builder.CreateICmpEQ(LhsLoad, RhsLoad); - } else { - const unsigned TotalSizeBits = std::accumulate( - Comparisons.begin(), Comparisons.end(), 0u, - [](int Size, const BCECmpBlock &C) { return Size + C.SizeBits(); }); - - // memcmp expects a 'size_t' argument and returns 'int'. - unsigned SizeTBits = TLI.getSizeTSize(*Phi.getModule()); - unsigned IntBits = TLI.getIntSize(); - - // Create memcmp() == 0. - const auto &DL = Phi.getDataLayout(); - Value *const MemCmpCall = emitMemCmp( - Lhs, Rhs, - ConstantInt::get(Builder.getIntNTy(SizeTBits), TotalSizeBits / 8), - Builder, DL, &TLI); - IsEqual = Builder.CreateICmpEQ( - MemCmpCall, ConstantInt::get(Builder.getIntNTy(IntBits), 0)); - } - - BasicBlock *const PhiBB = Phi.getParent(); - // Add a branch to the next basic block in the chain. - if (NextCmpBlock == PhiBB) { - // Continue to phi, passing it the comparison result. - Builder.CreateBr(PhiBB); - Phi.addIncoming(IsEqual, BB); - DTU.applyUpdates({{DominatorTree::Insert, BB, PhiBB}}); - } else { - // Continue to next block if equal, exit to phi else. - Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB); - Phi.addIncoming(ConstantInt::getFalse(Context), BB); - DTU.applyUpdates({{DominatorTree::Insert, BB, NextCmpBlock}, - {DominatorTree::Insert, BB, PhiBB}}); - } + // memcmp expects a 'size_t' argument and returns 'int'. + unsigned SizeTBits = TLI.getSizeTSize(*Phi.getModule()); + unsigned IntBits = TLI.getIntSize(); + const unsigned TotalSizeBits = + std::accumulate(Comparisons.begin(), Comparisons.end(), 0u, + [](int Size, const SingleBCECmpBlock &C) { + return Size + C.getCmp()->SizeBits; + }); + + // Create memcmp() == 0. 
+ const auto &DL = Phi.getDataLayout(); + Value *const MemCmpCall = emitMemCmp( + Lhs, Rhs, + ConstantInt::get(Builder.getIntNTy(SizeTBits), TotalSizeBits / 8), + Builder, DL, &TLI); + Value *IsEqual = Builder.CreateICmpEQ( + MemCmpCall, ConstantInt::get(Builder.getIntNTy(IntBits), 0)); + + updateBranching(IsEqual, Builder, BB, NextCmpBlock, Phi, Context, TLI, AA, + DTU); return BB; } +// Keep existing block if it isn't merged. Only change the branches. +// Also handles not splitting mult-blocks that use select instructions. +static BasicBlock *updateOriginalBlock(BasicBlock *const BB, + BasicBlock *const InsertBefore, + BasicBlock *const NextCmpBlock, + PHINode &Phi, LLVMContext &Context, + const TargetLibraryInfo &TLI, + AliasAnalysis &AA, DomTreeUpdater &DTU) { + BasicBlock *MultBB = BasicBlock::Create( + Context, BB->getName(), NextCmpBlock->getParent(), InsertBefore); + auto *const BranchI = cast(BB->getTerminator()); + Value *CondResult = nullptr; + if (BranchI->isUnconditional()) + CondResult = Phi.getIncomingValueForBlock(BB); + else + CondResult = cast(BranchI->getCondition()); + // Transfer all instructions except the branching terminator to the new block. + MultBB->splice(MultBB->end(), BB, BB->begin(), std::prev(BB->end())); + IRBuilder<> Builder(MultBB); + updateBranching(CondResult, Builder, MultBB, NextCmpBlock, Phi, Context, TLI, + AA, DTU); + + return MultBB; +} + bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA, DomTreeUpdater &DTU) { assert(atLeastOneMerged() && "simplifying trivial BCECmpChain"); @@ -694,9 +1009,24 @@ bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA, // so that the next block is always available to branch to. 
BasicBlock *InsertBefore = EntryBlock_; BasicBlock *NextCmpBlock = Phi_.getParent(); - for (const auto &Blocks : reverse(MergedBlocks_)) { - InsertBefore = NextCmpBlock = mergeComparisons( - Blocks, InsertBefore, NextCmpBlock, Phi_, TLI, AA, DTU); + SmallDenseSet ExistingBlocksToKeep; + LLVMContext &Context = NextCmpBlock->getContext(); + for (const auto &Cmps : reverse(MergedBlocks_)) { + // If there is only a single comparison then nothing should + // be merged and can use original block. + if (Cmps.size() == 1) { + // If a comparison from a mult-block is already handled + // then don't emit same block again. + BasicBlock *const BB = Cmps[0].BB; + if (ExistingBlocksToKeep.contains(BB)) + continue; + ExistingBlocksToKeep.insert(BB); + InsertBefore = NextCmpBlock = updateOriginalBlock( + BB, InsertBefore, NextCmpBlock, Phi_, Context, TLI, AA, DTU); + } else { + InsertBefore = NextCmpBlock = mergeComparisons( + Cmps, InsertBefore, NextCmpBlock, Phi_, Context, TLI, AA, DTU); + } } // Replace the original cmp chain with the new cmp chain by pointing all @@ -726,7 +1056,12 @@ bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA, // Delete merged blocks. This also removes incoming values in phi. SmallVector DeadBlocks; for (const auto &Blocks : MergedBlocks_) { - for (const BCECmpBlock &Block : Blocks) { + for (const SingleBCECmpBlock &Block : Blocks) { + // Many single blocks can refer to the same multblock coming from an + // select instruction. 
+ // TODO: preferrably use a set instead + if (llvm::is_contained(DeadBlocks, Block.BB)) + continue; LLVM_DEBUG(dbgs() << "Deleting merged block " << Block.BB->getName() << "\n"); DeadBlocks.push_back(Block.BB); @@ -773,6 +1108,12 @@ std::vector getOrderedBlocks(PHINode &Phi, return Blocks; } +template bool isInvalidPrevBlock(PHINode &Phi, unsigned I) { + auto *IncomingValue = Phi.getIncomingValue(I); + return !isa(IncomingValue) || + cast(IncomingValue)->getParent() != Phi.getIncomingBlock(I); +} + bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA, DomTreeUpdater &DTU) { LLVM_DEBUG(dbgs() << "processPhi()\n"); @@ -804,9 +1145,8 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA, LLVM_DEBUG(dbgs() << "skip: several non-constant values\n"); return false; } - if (!isa(Phi.getIncomingValue(I)) || - cast(Phi.getIncomingValue(I))->getParent() != - Phi.getIncomingBlock(I)) { + if (isInvalidPrevBlock(Phi, I) && + isInvalidPrevBlock(Phi, I)) { // Non-constant incoming value is not from a cmp instruction or not // produced by the last block. We could end up processing the value // producing block more than once. 
@@ -831,6 +1171,7 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA, const auto Blocks = getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues()); + if (Blocks.empty()) return false; BCECmpChain CmpChain(Blocks, Phi, AA); @@ -839,6 +1180,11 @@ bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA, return false; } + if (CmpChain.multBlockOnlyPartiallyMerged()) { + LLVM_DEBUG(dbgs() << "chain uses not fully merged basic block, no merge\n"); + return false; + } + return CmpChain.simplify(TLI, AA, DTU); } diff --git a/llvm/test/Transforms/MergeICmps/X86/const-cmp-bb.ll b/llvm/test/Transforms/MergeICmps/X86/const-cmp-bb.ll new file mode 100644 index 0000000000000..3956c62579986 --- /dev/null +++ b/llvm/test/Transforms/MergeICmps/X86/const-cmp-bb.ll @@ -0,0 +1,57 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=mergeicmps -verify-dom-info -S 2>&1 | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes='mergeicmps,expand-memcmp' -verify-dom-info -S 2>&1 | FileCheck %s --check-prefix=EXPANDED + +; adjacent byte pointer accesses compared to constants, should be merged into single memcmp, spanning multiple basic blocks + +; CHECK: [[MEMCMP_OP:@memcmp_const_op]] = private constant <{ i8, i8, i8 }> <{ i8 -1, i8 -56, i8 -66 }> + +; Global should be removed once its constant has been folded. 
+; EXPANDED-NOT: [[MEMCMP_OP:@memcmp_const_op]] = private constant <{ i8, i8, i8 }> <{ i8 -1, i8 -56, i8 -66 }> + +define zeroext i1 @test(ptr nocapture noundef nonnull dereferenceable(3) %p) local_unnamed_addr #0 { +; CHECK-LABEL: @test( +; CHECK-NEXT: "entry+land.lhs.true+land.rhs": +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[p:%.*]], ptr [[MEMCMP_OP]], i64 3) +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: br label [[LAND_END5:%.*]] +; CHECK: land.end: +; CHECK-NEXT: ret i1 [[TMP1]] +; +; EXPANDED-LABEL: define zeroext i1 @test( +; EXPANDED-SAME: ptr nocapture noundef nonnull dereferenceable(3) [[P:%.*]]) local_unnamed_addr { +; EXPANDED-NEXT: "entry+land.lhs.true+land.rhs": +; EXPANDED-NEXT: [[TMP0:%.*]] = load i16, ptr [[P]], align 1 +; EXPANDED-NEXT: [[TMP8:%.*]] = xor i16 [[TMP0]], -14081 +; EXPANDED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P]], i64 2 +; EXPANDED-NEXT: [[TMP3:%.*]] = load i8, ptr [[TMP2]], align 1 +; EXPANDED-NEXT: [[TMP4:%.*]] = zext i8 [[TMP3]] to i16 +; EXPANDED-NEXT: [[TMP5:%.*]] = xor i16 [[TMP4]], 190 +; EXPANDED-NEXT: [[TMP6:%.*]] = or i16 [[TMP8]], [[TMP5]] +; EXPANDED-NEXT: [[TMP7:%.*]] = icmp ne i16 [[TMP6]], 0 +; EXPANDED-NEXT: [[CMP:%.*]] = zext i1 [[TMP7]] to i32 +; EXPANDED-NEXT: [[RES:%.*]] = icmp eq i32 [[CMP]], 0 +; EXPANDED-NEXT: br label %[[LAND_END:.*]] +; EXPANDED: [[LAND_END]]: +; EXPANDED-NEXT: ret i1 [[RES]] +; +entry: + %0 = load i8, ptr %p, align 1 + %cmp = icmp eq i8 %0, -1 + br i1 %cmp, label %land.lhs.true, label %land.end + +land.lhs.true: ; preds = %entry + %arrayidx1 = getelementptr inbounds nuw i8, ptr %p, i64 1 + %1 = load i8, ptr %arrayidx1, align 1 + %cmp5 = icmp eq i8 %1, -56 + br i1 %cmp5, label %land.rhs, label %land.end + +land.rhs: ; preds = %land.lhs.true + %arrayidx2 = getelementptr inbounds nuw i8, ptr %p, i64 2 + %2 = load i8, ptr %arrayidx2, align 1 + %cmp8 = icmp eq i8 %2, -66 + br label %land.end + +land.end: ; preds = %land.rhs, %land.lhs.true, %entry + %3 
= phi i1 [ false, %land.lhs.true ], [ false, %entry ], [ %cmp8, %land.rhs ] + ret i1 %3 +} diff --git a/llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll b/llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll index bc6beefb2caee..65156697f1892 100644 --- a/llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll +++ b/llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll @@ -11,10 +11,10 @@ define zeroext i1 @opeq1( ; CHECK-LABEL: @opeq1( ; CHECK-NEXT: entry2: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[S:%.*]], ptr [[A:%.*]], i64 0, i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[S]], ptr [[B:%.*]], i64 0, i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[S]], ptr [[B:%.*]], i64 0, i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP1]], [[TMP3]] ; CHECK-NEXT: br i1 [[TMP4]], label %"land.rhs.i+land.rhs.i.2", label [[OPEQ1_EXIT:%.*]] ; CHECK: "land.rhs.i+land.rhs.i.2": ; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[A]], ptr [[B]], i64 8) @@ -22,10 +22,10 @@ define zeroext i1 @opeq1( ; CHECK-NEXT: br i1 [[TMP5]], label [[LAND_RHS_I_31:%.*]], label [[OPEQ1_EXIT]] ; CHECK: land.rhs.i.31: ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[S]], ptr [[A]], i64 0, i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[S]], ptr [[B]], i64 0, i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[S]], ptr [[B]], i64 0, i32 3 +; CHECK-NEXT: 
[[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP7]], [[TMP9]] ; CHECK-NEXT: br label [[OPEQ1_EXIT]] ; CHECK: opeq1.exit: ; CHECK-NEXT: [[TMP11:%.*]] = phi i1 [ [[TMP10]], [[LAND_RHS_I_31]] ], [ false, %"land.rhs.i+land.rhs.i.2" ], [ false, [[ENTRY2:%.*]] ] diff --git a/llvm/test/Transforms/MergeICmps/X86/many-const-cmp-select.ll b/llvm/test/Transforms/MergeICmps/X86/many-const-cmp-select.ll new file mode 100644 index 0000000000000..c4c2fe7e6a222 --- /dev/null +++ b/llvm/test/Transforms/MergeICmps/X86/many-const-cmp-select.ll @@ -0,0 +1,72 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=mergeicmps -verify-dom-info -S 2>&1 | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes='mergeicmps,expand-memcmp' -verify-dom-info -S 2>&1 | FileCheck %s --check-prefix=EXPANDED + +; Can merge contiguous const-comparison basic blocks that include a select statement. + +; CHECK: [[MEMCMP_OP0:@memcmp_const_op]] = private constant <{ i8, i8 }> <{ i8 2, i8 7 }> +; CHECK: [[MEMCMP_OP1:@memcmp_const_op.1]] = private constant <{ i8, i8, i8, i8 }> <{ i8 -1, i8 -56, i8 -66, i8 1 }> + +; EXPANDED-NOT: [[MEMCMP_OP0:@memcmp_const_op]] = private constant <{ i8, i8 }> <{ i8 2, i8 7 }> +; EXPANDED-NOT: [[MEMCMP_OP1:@memcmp_const_op.1]] = private constant <{ i8, i8, i8, i8 }> <{ i8 -1, i8 -56, i8 -66, i8 1 }> + +define dso_local zeroext i1 @is_all_ones_many(ptr nocapture noundef nonnull dereferenceable(24) %p) local_unnamed_addr { +; CHECK-LABEL: @is_all_ones_many( +; CHECK-NEXT: "entry+land.lhs.true11": +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[P:%.*]], ptr [[MEMCMP_OP1]], i64 4) +; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: br i1 [[TMP0]], label [[NEXT_MEMCMP:%.*]], label [[LAND_END:%.*]] +; CHECK: "land.lhs.true16+land.lhs.true21": +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 6 +; CHECK-NEXT: [[MEMCMP1:%.*]] = call i32 @memcmp(ptr [[TMP1]], ptr 
[[MEMCMP_OP0]], i64 2) +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i32 [[MEMCMP1]], 0 +; CHECK-NEXT: br i1 [[TMP2]], label [[LAST_CMP:%.*]], label [[LAND_END]] +; CHECK: land.rhs1: +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 9 +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: [[TMP6:%.*]] = phi i1 [ [[TMP5]], [[LAST_CMP]] ], [ false, [[NEXT_MEMCMP]] ], [ false, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i1 [[TMP6]] +; +entry: + %0 = load i8, ptr %p, align 1 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %p, i64 1 + %1 = load i8, ptr %arrayidx1, align 1 + %arrayidx2 = getelementptr inbounds nuw i8, ptr %p, i64 2 + %2 = load i8, ptr %arrayidx2, align 1 + %cmp = icmp eq i8 %0, -1 + %cmp5 = icmp eq i8 %1, -56 + %or.cond = select i1 %cmp, i1 %cmp5, i1 false + %cmp9 = icmp eq i8 %2, -66 + %or.cond28 = select i1 %or.cond, i1 %cmp9, i1 false + br i1 %or.cond28, label %land.lhs.true11, label %land.end + +land.lhs.true11: ; preds = %entry + %arrayidx12 = getelementptr inbounds nuw i8, ptr %p, i64 3 + %3 = load i8, ptr %arrayidx12, align 1 + %cmp14 = icmp eq i8 %3, 1 + br i1 %cmp14, label %land.lhs.true16, label %land.end + +land.lhs.true16: ; preds = %land.lhs.true11 + %arrayidx17 = getelementptr inbounds nuw i8, ptr %p, i64 6 + %4 = load i8, ptr %arrayidx17, align 1 + %cmp19 = icmp eq i8 %4, 2 + br i1 %cmp19, label %land.lhs.true21, label %land.end + +land.lhs.true21: ; preds = %land.lhs.true16 + %arrayidx22 = getelementptr inbounds nuw i8, ptr %p, i64 7 + %5 = load i8, ptr %arrayidx22, align 1 + %cmp24 = icmp eq i8 %5, 7 + br i1 %cmp24, label %land.rhs, label %land.end + +land.rhs: ; preds = %land.lhs.true21 + %arrayidx26 = getelementptr inbounds nuw i8, ptr %p, i64 9 + %6 = load i8, ptr %arrayidx26, align 1 + %cmp28 = icmp eq i8 %6, 9 + br label %land.end + +land.end: ; preds = %land.rhs, %land.lhs.true21, 
%land.lhs.true16, %land.lhs.true11, %entry + %7 = phi i1 [ false, %land.lhs.true21 ], [ false, %land.lhs.true16 ], [ false, %land.lhs.true11 ], [ false, %entry ], [ %cmp28, %land.rhs ] + ret i1 %7 +} diff --git a/llvm/test/Transforms/MergeICmps/X86/mixed-cmp-bb-select.ll b/llvm/test/Transforms/MergeICmps/X86/mixed-cmp-bb-select.ll new file mode 100644 index 0000000000000..d81aecc76ea4a --- /dev/null +++ b/llvm/test/Transforms/MergeICmps/X86/mixed-cmp-bb-select.ll @@ -0,0 +1,67 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=mergeicmps -verify-dom-info -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes='mergeicmps,expand-memcmp' -verify-dom-info -S 2>&1 | FileCheck %s --check-prefix=EXPANDED + +; Tests if a mixed chain of comparisons (including a select block) can still be merged into two memcmp calls. + +; CHECK: [[MEMCMP_OP0:@memcmp_const_op]] = private constant <{ i32, i32, i32 }> <{ i32 255, i32 200, i32 100 }> +; EXPANDED-NOT: [[MEMCMP_OP0:@memcmp_const_op]] = private constant <{ i32, i32, i32 }> <{ i32 255, i32 200, i32 100 }> + +define dso_local noundef zeroext i1 @cmp_mixed( + ptr noundef nonnull readonly align 4 captures(none) dereferenceable(20) %a, + ptr noundef nonnull readonly align 4 captures(none) dereferenceable(20) %b) local_unnamed_addr { +; CHECK-LABEL: @cmp_mixed( +; CHECK: "land.lhs.true+land.lhs.true10+land.lhs.true4": +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[A:%.*]], ptr [[B:%.*]], i64 6) +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[ENTRY_LAND_RHS:%.*]], label [[LAND_END:%.*]] +; CHECK: "entry+land.rhs+land.lhs.true4": +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 8 +; CHECK-NEXT: [[MEMCMP2:%.*]] = call i32 @memcmp(ptr [[TMP0]], ptr [[MEMCMP_OP0]], i64 12) +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[MEMCMP2]], 0 +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[CMP2]], 
[[ENTRY_LAND_RHS]] ], [ false, [[LAND_LHS_TRUE10:%.*]] ] +; CHECK-NEXT: ret i1 [[TMP4]] +; +entry: + %e = getelementptr inbounds nuw i8, ptr %a, i64 8 + %0 = load i32, ptr %e, align 4 + %cmp = icmp eq i32 %0, 255 + br i1 %cmp, label %land.lhs.true, label %land.end + +land.lhs.true: ; preds = %entry + %1 = load i32, ptr %a, align 4 + %2 = load i32, ptr %b, align 4 + %cmp3 = icmp eq i32 %1, %2 + br i1 %cmp3, label %land.lhs.true4, label %land.end + +land.lhs.true4: ; preds = %land.lhs.true + %c = getelementptr inbounds nuw i8, ptr %a, i64 5 + %3 = load i8, ptr %c, align 1 + %c5 = getelementptr inbounds nuw i8, ptr %b, i64 5 + %4 = load i8, ptr %c5, align 1 + %cmp7 = icmp eq i8 %3, %4 + %g = getelementptr inbounds nuw i8, ptr %a, i64 16 + %5 = load i32, ptr %g, align 4 + %cmp9 = icmp eq i32 %5, 100 + %or.cond = select i1 %cmp7, i1 %cmp9, i1 false + br i1 %or.cond, label %land.lhs.true10, label %land.end + +land.lhs.true10: ; preds = %land.lhs.true4 + %b11 = getelementptr inbounds nuw i8, ptr %a, i64 4 + %6 = load i8, ptr %b11, align 4 + %b13 = getelementptr inbounds nuw i8, ptr %b, i64 4 + %7 = load i8, ptr %b13, align 4 + %cmp15 = icmp eq i8 %6, %7 + br i1 %cmp15, label %land.rhs, label %land.end + +land.rhs: ; preds = %land.lhs.true10 + %f = getelementptr inbounds nuw i8, ptr %a, i64 12 + %8 = load i32, ptr %f, align 4 + %cmp16 = icmp eq i32 %8, 200 + br label %land.end + +land.end: ; preds = %land.rhs, %land.lhs.true10, %land.lhs.true4, %land.lhs.true, %entry + %9 = phi i1 [ false, %land.lhs.true10 ], [ false, %land.lhs.true4 ], [ false, %land.lhs.true ], [ false, %entry ], [ %cmp16, %land.rhs ] + ret i1 %9 +} diff --git a/llvm/test/Transforms/MergeICmps/X86/mixed-cmp-split.ll b/llvm/test/Transforms/MergeICmps/X86/mixed-cmp-split.ll new file mode 100644 index 0000000000000..3e4e4c3eaf6be --- /dev/null +++ b/llvm/test/Transforms/MergeICmps/X86/mixed-cmp-split.ll @@ -0,0 +1,135 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=mergeicmps -verify-dom-info -S 
| FileCheck %s + +declare void @foo(...) + +; Tests that if both const-cmp and bce-cmp chains can be merged that the splitted block is still at the beginning. + +; CHECK: [[MEMCMP_OP0:@memcmp_const_op]] = private constant <{ i32, i32, i32 }> <{ i32 255, i32 200, i32 100 }> +; CHECK: [[MEMCMP_OP1:@memcmp_const_op.1]] = private constant <{ i32, i32, i32 }> <{ i32 255, i32 200, i32 100 }> + +define dso_local noundef zeroext i1 @cmp_mixed_const_first(ptr noundef nonnull align 4 dereferenceable(20) %a, ptr noundef nonnull align 4 dereferenceable(20) %b) local_unnamed_addr { +; CHECK-LABEL: @cmp_mixed_const_first( +; This merged-block should come first as it should be split. +; CHECK: "entry+land.rhs+land.lhs.true8": +; CHECK-NEXT: call void (...) @foo() #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[A:%.*]], i64 8 +; CHECK-NEXT: [[MEMCMP0:%.*]] = call i32 @memcmp(ptr [[TMP0]], ptr [[MEMCMP_OP0]], i64 12) +; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i32 [[MEMCMP0]], 0 +; CHECK-NEXT: br i1 [[CMP0]], label [[LAND_LHS_TRUE10:%.*]], label [[LAND_END:%.*]] +; CHECK: "land.lhs.true+land.lhs.true10+land.lhs.true4": +; CHECK-NEXT: [[MEMCMP1:%.*]] = call i32 @memcmp(ptr [[A]], ptr [[B:%.*]], i64 6) +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[MEMCMP1]], 0 +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: [[RES:%.*]] = phi i1 [ [[CMP1]], [[LAND_LHS_TRUE10]] ], [ false, [[ENTRY_LAND_RHS:%.*]] ] +; CHECK-NEXT: ret i1 [[RES]] +; +entry: + %e = getelementptr inbounds nuw i8, ptr %a, i64 8 + %0 = load i32, ptr %e, align 4 + %cmp = icmp eq i32 %0, 255 + call void (...) 
@foo() inaccessiblememonly + br i1 %cmp, label %land.lhs.true, label %land.end + +land.lhs.true: ; preds = %entry + %1 = load i32, ptr %a, align 4 + %2 = load i32, ptr %b, align 4 + %cmp3 = icmp eq i32 %1, %2 + br i1 %cmp3, label %land.lhs.true4, label %land.end + +land.lhs.true4: ; preds = %land.lhs.true + %c = getelementptr inbounds nuw i8, ptr %a, i64 5 + %3 = load i8, ptr %c, align 1 + %c5 = getelementptr inbounds nuw i8, ptr %b, i64 5 + %4 = load i8, ptr %c5, align 1 + %cmp7 = icmp eq i8 %3, %4 + br i1 %cmp7, label %land.lhs.true8, label %land.end + +land.lhs.true8: ; preds = %land.lhs.true4 + %g = getelementptr inbounds nuw i8, ptr %a, i64 16 + %5 = load i32, ptr %g, align 4 + %cmp9 = icmp eq i32 %5, 100 + br i1 %cmp9, label %land.lhs.true10, label %land.end + +land.lhs.true10: ; preds = %land.lhs.true8 + %b11 = getelementptr inbounds nuw i8, ptr %a, i64 4 + %6 = load i8, ptr %b11, align 4 + %b13 = getelementptr inbounds nuw i8, ptr %b, i64 4 + %7 = load i8, ptr %b13, align 4 + %cmp15 = icmp eq i8 %6, %7 + br i1 %cmp15, label %land.rhs, label %land.end + +land.rhs: ; preds = %land.lhs.true10 + %f = getelementptr inbounds nuw i8, ptr %a, i64 12 + %8 = load i32, ptr %f, align 4 + %cmp16 = icmp eq i32 %8, 200 + br label %land.end + +land.end: ; preds = %land.rhs, %land.lhs.true10, %land.lhs.true8, %land.lhs.true4, %land.lhs.true, %entry + %9 = phi i1 [ false, %land.lhs.true10 ], [ false, %land.lhs.true8 ], [ false, %land.lhs.true4 ], [ false, %land.lhs.true ], [ false, %entry ], [ %cmp16, %land.rhs ] + ret i1 %9 +} + +; If block to split it in BCE-comparison that that block should be first. + +define dso_local noundef zeroext i1 @cmp_mixed_bce_first( + ptr noundef nonnull readonly align 4 captures(none) dereferenceable(20) %a, + ptr noundef nonnull readonly align 4 captures(none) dereferenceable(20) %b) local_unnamed_addr { +; CHECK-LABEL: @cmp_mixed_bce_first( +; CHECK: "entry+land.lhs.true10+land.lhs.true4": +; CHECK-NEXT: call void (...) 
@foo() #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[A:%.*]], ptr [[B:%.*]], i64 6) +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[LAND_LHS_TRUE:%.*]], label [[LAND_END:%.*]] +; CHECK: "land.lhs.true+land.rhs+land.lhs.true4": +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 8 +; CHECK-NEXT: [[MEMCMP2:%.*]] = call i32 @memcmp(ptr [[TMP0]], ptr [[MEMCMP_OP1]], i64 12) +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[MEMCMP2]], 0 +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[CMP2]], [[LAND_LHS_TRUE]] ], [ false, [[ENTRY:%.*]] ] +; CHECK-NEXT: ret i1 [[TMP4]] +; +entry: + %0 = load i32, ptr %a, align 4 + %1 = load i32, ptr %b, align 4 + call void (...) @foo() inaccessiblememonly + %cmp3 = icmp eq i32 %0, %1 + br i1 %cmp3, label %land.lhs.true, label %land.end + +land.lhs.true: + %e = getelementptr inbounds nuw i8, ptr %a, i64 8 + %2 = load i32, ptr %e, align 4 + %cmp = icmp eq i32 %2, 255 + br i1 %cmp, label %land.lhs.true4, label %land.end + +land.lhs.true4: ; preds = %land.lhs.true + %c = getelementptr inbounds nuw i8, ptr %a, i64 5 + %3 = load i8, ptr %c, align 1 + %c5 = getelementptr inbounds nuw i8, ptr %b, i64 5 + %4 = load i8, ptr %c5, align 1 + %cmp7 = icmp eq i8 %3, %4 + %g = getelementptr inbounds nuw i8, ptr %a, i64 16 + %5 = load i32, ptr %g, align 4 + %cmp9 = icmp eq i32 %5, 100 + %or.cond = select i1 %cmp7, i1 %cmp9, i1 false + br i1 %or.cond, label %land.lhs.true10, label %land.end + +land.lhs.true10: ; preds = %land.lhs.true4 + %b11 = getelementptr inbounds nuw i8, ptr %a, i64 4 + %6 = load i8, ptr %b11, align 4 + %b13 = getelementptr inbounds nuw i8, ptr %b, i64 4 + %7 = load i8, ptr %b13, align 4 + %cmp15 = icmp eq i8 %6, %7 + br i1 %cmp15, label %land.rhs, label %land.end + +land.rhs: ; preds = %land.lhs.true10 + %f = getelementptr inbounds nuw i8, ptr %a, i64 12 + %8 = load i32, ptr %f, align 4 + %cmp16 
= icmp eq i32 %8, 200 + br label %land.end + +land.end: ; preds = %land.rhs, %land.lhs.true10, %land.lhs.true4, %land.lhs.true, %entry + %9 = phi i1 [ false, %land.lhs.true10 ], [ false, %land.lhs.true4 ], [ false, %land.lhs.true ], [ false, %entry ], [ %cmp16, %land.rhs ] + ret i1 %9 +} diff --git a/llvm/test/Transforms/MergeICmps/X86/mixed-comparisons.ll b/llvm/test/Transforms/MergeICmps/X86/mixed-comparisons.ll new file mode 100644 index 0000000000000..b5e85d3a09dfb --- /dev/null +++ b/llvm/test/Transforms/MergeICmps/X86/mixed-comparisons.ll @@ -0,0 +1,69 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=mergeicmps -verify-dom-info -S | FileCheck %s + +; Tests if a mixed chain of comparisons can still be merged into two memcmp calls. +; a.e == 255 && a.a == b.a && a.c == b.c && a.g == 100 && a.b == b.b && a.f == 200; + +; CHECK: [[MEMCMP_OP0:@memcmp_const_op]] = private constant <{ i32, i32, i32 }> <{ i32 255, i32 200, i32 100 }> + +define dso_local noundef zeroext i1 @cmp_mixed(ptr noundef nonnull align 4 dereferenceable(20) %a, ptr noundef nonnull align 4 dereferenceable(20) %b) local_unnamed_addr { +; CHECK-LABEL: @cmp_mixed( +; This is the classic BCE comparison block +; CHECK: "land.lhs.true+land.lhs.true10+land.lhs.true4": +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[A:%.*]], ptr [[B:%.*]], i64 6) +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[ENTRY_LAND_RHS:%.*]], label [[LAND_END:%.*]] +; This is the new BCE to constant comparison block +; CHECK: "entry+land.rhs+land.lhs.true8": +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 8 +; CHECK-NEXT: [[MEMCMP2:%.*]] = call i32 @memcmp(ptr [[TMP0]], ptr [[MEMCMP_OP0]], i64 12) +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[MEMCMP2]], 0 +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: [[TMP4:%.*]] = phi i1 [ [[CMP2]], 
[[ENTRY_LAND_RHS]] ], [ false, [[LAND_LHS_TRUE10:%.*]] ] +; CHECK-NEXT: ret i1 [[TMP4]] +; +entry: + %e = getelementptr inbounds nuw i8, ptr %a, i64 8 + %0 = load i32, ptr %e, align 4 + %cmp = icmp eq i32 %0, 255 + br i1 %cmp, label %land.lhs.true, label %land.end + +land.lhs.true: ; preds = %entry + %1 = load i32, ptr %a, align 4 + %2 = load i32, ptr %b, align 4 + %cmp3 = icmp eq i32 %1, %2 + br i1 %cmp3, label %land.lhs.true4, label %land.end + +land.lhs.true4: ; preds = %land.lhs.true + %c = getelementptr inbounds nuw i8, ptr %a, i64 5 + %3 = load i8, ptr %c, align 1 + %c5 = getelementptr inbounds nuw i8, ptr %b, i64 5 + %4 = load i8, ptr %c5, align 1 + %cmp7 = icmp eq i8 %3, %4 + br i1 %cmp7, label %land.lhs.true8, label %land.end + +land.lhs.true8: ; preds = %land.lhs.true4 + %g = getelementptr inbounds nuw i8, ptr %a, i64 16 + %5 = load i32, ptr %g, align 4 + %cmp9 = icmp eq i32 %5, 100 + br i1 %cmp9, label %land.lhs.true10, label %land.end + +land.lhs.true10: ; preds = %land.lhs.true8 + %b11 = getelementptr inbounds nuw i8, ptr %a, i64 4 + %6 = load i8, ptr %b11, align 4 + %b13 = getelementptr inbounds nuw i8, ptr %b, i64 4 + %7 = load i8, ptr %b13, align 4 + %cmp15 = icmp eq i8 %6, %7 + br i1 %cmp15, label %land.rhs, label %land.end + +land.rhs: ; preds = %land.lhs.true10 + %f = getelementptr inbounds nuw i8, ptr %a, i64 12 + %8 = load i32, ptr %f, align 4 + %cmp16 = icmp eq i32 %8, 200 + br label %land.end + +land.end: ; preds = %land.rhs, %land.lhs.true10, %land.lhs.true8, %land.lhs.true4, %land.lhs.true, %entry + %9 = phi i1 [ false, %land.lhs.true10 ], [ false, %land.lhs.true8 ], [ false, %land.lhs.true4 ], [ false, %land.lhs.true ], [ false, %entry ], [ %cmp16, %land.rhs ] + ret i1 %9 +} diff --git a/llvm/test/Transforms/MergeICmps/X86/mixed-type-const-comparisons.ll b/llvm/test/Transforms/MergeICmps/X86/mixed-type-const-comparisons.ll new file mode 100644 index 0000000000000..3a5bf5585d46a --- /dev/null +++ 
b/llvm/test/Transforms/MergeICmps/X86/mixed-type-const-comparisons.ll @@ -0,0 +1,78 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=mergeicmps -verify-dom-info -S | FileCheck %s + +; Tests if a const-cmp-chain of different types can still be merged. +; This is usually the case when comparing different struct fields to constants. + +; CHECK: [[MEMCMP_OP0:@memcmp_const_op]] = private constant <{ i32, i8 }> <{ i32 3, i8 100 }> +; CHECK: [[MEMCMP_OP1:@memcmp_const_op.1]] = private constant <{ i32, i8, i8 }> <{ i32 200, i8 3, i8 100 }> + +; Can only merge gep 0 with gep 4 due to alignment since gep 8 is not directly adjacent to gep 4. +define dso_local zeroext i1 @is_all_ones_struct( +; CHECK-LABEL: @is_all_ones_struct( +; CHECK: entry1: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i64 8 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TMP1]], 200 +; CHECK-NEXT: br i1 [[CMP0]], label [[MERGED:%.*]], label [[LAND_END:%.*]] +; CHECK: "land.rhs+land.lhs.true": +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[P]], ptr [[MEMCMP_OP0]], i64 5) +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: [[RES:%.*]] = phi i1 [ [[CMP1]], [[MERGED]] ], [ false, %entry1 ] +; CHECK-NEXT: ret i1 [[RES]] +; + ptr noundef nonnull readonly align 4 captures(none) dereferenceable(24) %p) local_unnamed_addr { +entry: + %c = getelementptr inbounds nuw i8, ptr %p, i64 8 + %0 = load i32, ptr %c, align 4 + %cmp = icmp eq i32 %0, 200 + br i1 %cmp, label %land.lhs.true, label %land.end + +land.lhs.true: ; preds = %entry + %b = getelementptr inbounds nuw i8, ptr %p, i64 4 + %1 = load i8, ptr %b, align 4 + %cmp1 = icmp eq i8 %1, 100 + br i1 %cmp1, label %land.rhs, label %land.end + +land.rhs: ; preds = %land.lhs.true + %2 = load i32, ptr %p, align 4 + %cmp3 = icmp eq i32 %2, 3 + br label %land.end + +land.end: ; preds = 
%land.rhs, %land.lhs.true, %entry + %3 = phi i1 [ false, %land.lhs.true ], [ false, %entry ], [ %cmp3, %land.rhs ] + ret i1 %3 +} + + +; Can also still merge select blocks with different types. +define dso_local noundef zeroext i1 @is_all_ones_struct_select_block( +; CHECK-LABEL: @is_all_ones_struct_select_block( +; CHECK: "entry+land.rhs": +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[P:%.*]], ptr [[MEMCMP_OP1]], i64 6) +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: ret i1 [[CMP1]] +; + ptr noundef nonnull readonly align 4 captures(none) dereferenceable(24) %p) local_unnamed_addr { +entry: + %0 = load i32, ptr %p, align 4 + %cmp = icmp eq i32 %0, 200 + %c = getelementptr inbounds nuw i8, ptr %p, i64 5 + %1 = load i8, ptr %c, align 1 + %cmp2 = icmp eq i8 %1, 100 + %or.cond = select i1 %cmp, i1 %cmp2, i1 false + br i1 %or.cond, label %land.rhs, label %land.end + +land.rhs: ; preds = %entry + %b3 = getelementptr inbounds nuw i8, ptr %p, i64 4 + %2 = load i8, ptr %b3, align 4 + %cmp5 = icmp eq i8 %2, 3 + br label %land.end + +land.end: ; preds = %land.rhs, %entry + %3 = phi i1 [ false, %entry ], [ %cmp5, %land.rhs ] + ret i1 %3 +} diff --git a/llvm/test/Transforms/MergeICmps/X86/not-split-unmerged-select.ll b/llvm/test/Transforms/MergeICmps/X86/not-split-unmerged-select.ll new file mode 100644 index 0000000000000..d059609afe292 --- /dev/null +++ b/llvm/test/Transforms/MergeICmps/X86/not-split-unmerged-select.ll @@ -0,0 +1,169 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=mergeicmps -verify-dom-info -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes='mergeicmps,expand-memcmp' -verify-dom-info -S 2>&1 | FileCheck %s --check-prefix=EXPANDED + +; No adjacent accesses to the same pointer so nothing should be merged. Select blocks won't get split. 
+ +; CHECK: [[MEMCMP_OP:@memcmp_const_op]] = private constant <{ i8, i8 }> <{ i8 1, i8 9 }> +; EXPANDED-NOT: [[MEMCMP_OP:@memcmp_const_op]] = private constant <{ i8, i8 }> <{ i8 1, i8 9 }> + +define dso_local noundef zeroext i1 @unmergable_select( + ptr noundef nonnull readonly align 8 captures(none) dereferenceable(24) %p) local_unnamed_addr { +; CHECK-LABEL: @unmergable_select( +; CHECK: entry: +; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i64 10 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[IDX0]], align 1 +; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[IDX1]], align 1 +; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[IDX2]], align 1 +; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i8 [[TMP0]], -1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP1]], -56 +; CHECK-NEXT: [[SEL0:%.*]] = select i1 [[CMP0]], i1 [[CMP1]], i1 false +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[TMP2]], -66 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[SEL0]], i1 [[CMP2]], i1 false +; CHECK-NEXT: br i1 [[SEL1]], label [[LAND_LHS_11:%.*]], label [[LAND_END:%.*]] +; CHECK: land.lhs.true11: +; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[IDX3]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP3]], 1 +; CHECK-NEXT: br i1 [[CMP3]], label [[LAND_LHS_16:%.*]], label [[LAND_END]] +; CHECK: land.lhs.true16: +; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 6 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[IDX4]], align 1 +; CHECK-NEXT: [[CMP4:%.*]] = icmp eq i8 [[TMP4]], 2 +; CHECK-NEXT: br i1 [[CMP4]], label [[LAND_LHS_21:%.*]], label [[LAND_END]] +; CHECK: land.lhs.true21: +; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[IDX5]], align 1 +; CHECK-NEXT: [[CMP5:%.*]] = 
icmp eq i8 [[TMP5]], 7 +; CHECK-NEXT: br i1 [[CMP5]], label [[LAND_RHS:%.*]], label [[LAND_END]] +; CHECK: land.rhs: +; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 14 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[IDX6]], align 1 +; CHECK-NEXT: [[CMP6:%.*]] = icmp eq i8 [[TMP6]], 9 +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: [[RES:%.*]] = phi i1 [ false, [[LAND_LHS_21]] ], [ false, [[LAND_LHS_16]] ], [ false, [[LAND_LHS_11]] ], [ false, %entry ], [ %cmp28, [[LAND_RHS]] ] +; CHECK-NEXT: ret i1 [[RES]] +; +entry: + %arrayidx = getelementptr inbounds nuw i8, ptr %p, i64 10 + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %p, i64 1 + %1 = load i8, ptr %arrayidx1, align 1 + %arrayidx2 = getelementptr inbounds nuw i8, ptr %p, i64 3 + %2 = load i8, ptr %arrayidx2, align 1 + %cmp = icmp eq i8 %0, -1 + %cmp5 = icmp eq i8 %1, -56 + %or.cond = select i1 %cmp, i1 %cmp5, i1 false + %cmp9 = icmp eq i8 %2, -66 + %or.cond30 = select i1 %or.cond, i1 %cmp9, i1 false + br i1 %or.cond30, label %land.lhs.true11, label %land.end + +land.lhs.true11: ; preds = %entry + %arrayidx12 = getelementptr inbounds nuw i8, ptr %p, i64 12 + %3 = load i8, ptr %arrayidx12, align 1 + %cmp14 = icmp eq i8 %3, 1 + br i1 %cmp14, label %land.lhs.true16, label %land.end + +land.lhs.true16: ; preds = %land.lhs.true11 + %arrayidx17 = getelementptr inbounds nuw i8, ptr %p, i64 6 + %4 = load i8, ptr %arrayidx17, align 1 + %cmp19 = icmp eq i8 %4, 2 + br i1 %cmp19, label %land.lhs.true21, label %land.end + +land.lhs.true21: ; preds = %land.lhs.true16 + %arrayidx22 = getelementptr inbounds nuw i8, ptr %p, i64 8 + %5 = load i8, ptr %arrayidx22, align 1 + %cmp24 = icmp eq i8 %5, 7 + br i1 %cmp24, label %land.rhs, label %land.end + +land.rhs: ; preds = %land.lhs.true21 + %arrayidx26 = getelementptr inbounds nuw i8, ptr %p, i64 14 + %6 = load i8, ptr %arrayidx26, align 1 + %cmp28 = icmp eq i8 %6, 9 + br label %land.end + 
+land.end: ; preds = %land.rhs, %land.lhs.true21, %land.lhs.true16, %land.lhs.true11, %entry + %7 = phi i1 [ false, %land.lhs.true21 ], [ false, %land.lhs.true16 ], [ false, %land.lhs.true11 ], [ false, %entry ], [ %cmp28, %land.rhs ] + ret i1 %7 +} + +; p[12] and p[13] are mergable; the select multi-block is part of the chain but isn't merged and won't get split up into its single comparisons. + +define dso_local noundef zeroext i1 @partial_merge_not_select(ptr noundef nonnull readonly align 8 captures(none) dereferenceable(24) %p) local_unnamed_addr { +; CHECK-LABEL: @partial_merge_not_select( +; CHECK: entry3: +; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i64 10 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[IDX0]], align 1 +; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[IDX1]], align 1 +; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[IDX2]], align 1 +; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i8 [[TMP0]], -1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP1]], -56 +; CHECK-NEXT: [[SEL0:%.*]] = select i1 [[CMP0]], i1 [[CMP1]], i1 false +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[TMP2]], -66 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[SEL0]], i1 [[CMP2]], i1 false +; CHECK-NEXT: br i1 [[SEL1]], label [[LAND_LHS_LAND_RHS:%.*]], label [[LAND_END:%.*]] +; CHECK: "land.lhs.true11+land.rhs": +; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[IDX3]], ptr [[MEMCMP_OP]], i64 2) +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: br i1 [[CMP3]], label [[LAND_LHS_16:%.*]], label [[LAND_END]] +; CHECK: land.lhs.true162: +; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 6 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[IDX4]], align 1 +; CHECK-NEXT: [[CMP4:%.*]] = icmp eq i8 [[TMP4]], 2 +;
CHECK-NEXT: br i1 [[CMP4]], label [[LAND_LHS_21:%.*]], label [[LAND_END]] +; CHECK: land.lhs.true211: +; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[IDX5]], align 1 +; CHECK-NEXT: [[CMP5:%.*]] = icmp eq i8 [[TMP5]], 7 +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: [[RES:%.*]] = phi i1 [ [[CMP5]], [[LAND_LHS_21]] ], [ false, [[LAND_LHS_16]] ], [ false, [[LAND_LHS_LAND_RHS]] ], [ false, %entry3 ] +; CHECK-NEXT: ret i1 [[RES]] +; +entry: + %arrayidx = getelementptr inbounds nuw i8, ptr %p, i64 10 + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %p, i64 1 + %1 = load i8, ptr %arrayidx1, align 1 + %arrayidx2 = getelementptr inbounds nuw i8, ptr %p, i64 3 + %2 = load i8, ptr %arrayidx2, align 1 + %cmp = icmp eq i8 %0, -1 + %cmp5 = icmp eq i8 %1, -56 + %or.cond = select i1 %cmp, i1 %cmp5, i1 false + %cmp9 = icmp eq i8 %2, -66 + %or.cond30 = select i1 %or.cond, i1 %cmp9, i1 false + br i1 %or.cond30, label %land.lhs.true11, label %land.end + +land.lhs.true11: ; preds = %entry + %arrayidx12 = getelementptr inbounds nuw i8, ptr %p, i64 12 + %3 = load i8, ptr %arrayidx12, align 1 + %cmp14 = icmp eq i8 %3, 1 + br i1 %cmp14, label %land.lhs.true16, label %land.end + +land.lhs.true16: ; preds = %land.lhs.true11 + %arrayidx17 = getelementptr inbounds nuw i8, ptr %p, i64 6 + %4 = load i8, ptr %arrayidx17, align 1 + %cmp19 = icmp eq i8 %4, 2 + br i1 %cmp19, label %land.lhs.true21, label %land.end + +land.lhs.true21: ; preds = %land.lhs.true16 + %arrayidx22 = getelementptr inbounds nuw i8, ptr %p, i64 8 + %5 = load i8, ptr %arrayidx22, align 1 + %cmp24 = icmp eq i8 %5, 7 + br i1 %cmp24, label %land.rhs, label %land.end + +land.rhs: ; preds = %land.lhs.true21 + %arrayidx26 = getelementptr inbounds nuw i8, ptr %p, i64 13 + %6 = load i8, ptr %arrayidx26, align 1 + %cmp28 = icmp eq i8 %6, 9 + br label %land.end + +land.end: ; preds = %land.rhs, 
%land.lhs.true21, %land.lhs.true16, %land.lhs.true11, %entry + %7 = phi i1 [ false, %land.lhs.true21 ], [ false, %land.lhs.true16 ], [ false, %land.lhs.true11 ], [ false, %entry ], [ %cmp28, %land.rhs ] + ret i1 %7 +} diff --git a/llvm/test/Transforms/MergeICmps/X86/partial-select-merge.ll b/llvm/test/Transforms/MergeICmps/X86/partial-select-merge.ll new file mode 100644 index 0000000000000..f67743ed6fcc1 --- /dev/null +++ b/llvm/test/Transforms/MergeICmps/X86/partial-select-merge.ll @@ -0,0 +1,178 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=mergeicmps -verify-dom-info -S | FileCheck %s + +; Cannot merge only part of a select block if the entire block is not mergable. + +define zeroext i1 @cmp_partially_mergable_select( + ptr nocapture readonly align 4 dereferenceable(24) %a, + ptr nocapture readonly align 4 dereferenceable(24) %b) local_unnamed_addr { +; CHECK-LABEL: @cmp_partially_mergable_select( +; CHECK: entry: +; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[A:%.*]], i64 8 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[IDX0]], align 4 +; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i32 [[TMP0]], 255 +; CHECK-NEXT: br i1 [[CMP0]], label [[LAND_LHS_TRUE:%.*]], label [[LAND_END:%.*]] +; CHECK: land.lhs.true: +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[B:%.*]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[TMP1]], [[TMP2]] +; CHECK-NEXT: br i1 [[CMP1]], label [[LAND_LHS_TRUE_4:%.*]], label [[LAND_END]] +; CHECK: land.lhs.true4: +; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 5 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[IDX1]], align 1 +; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 5 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[IDX2]], align 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 16 +; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr
[[IDX3]], align 4 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i32 [[TMP5]], 100 +; CHECK-NEXT: [[SEL0:%.*]] = select i1 [[CMP2]], i1 [[CMP3]], i1 false +; CHECK-NEXT: br i1 [[SEL0]], label [[LAND_LHS_TRUE_10:%.*]], label [[LAND_END]] +; CHECK: land.lhs.true10: +; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 20 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[IDX4]], align 4 +; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 20 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, ptr [[IDX5]], align 4 +; CHECK-NEXT: [[CMP4:%.*]] = icmp eq i8 [[TMP6]], [[TMP7]] +; CHECK-NEXT: br i1 [[CMP4]], label [[LAND_RHS:%.*]], label [[LAND_END]] +; CHECK: land.rhs: +; CHECK-NEXT: [[IDX6:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 4 +; CHECK-NEXT: [[TMP8:%.*]] = load i8, ptr [[IDX6]], align 4 +; CHECK-NEXT: [[IDX7:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 4 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, ptr [[IDX7]], align 4 +; CHECK-NEXT: [[CMP5:%.*]] = icmp eq i8 [[TMP8]], [[TMP9]] +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: [[RES:%.*]] = phi i1 [ false, [[LAND_LHS_TRUE_10]] ], [ false, [[LAND_LHS_TRUE_4]] ], [ false, [[LAND_LHS_TRUE]] ], [ false, %entry ], [ [[CMP5]], [[LAND_RHS]] ] +; CHECK-NEXT: ret i1 [[RES]] +; +entry: + %e = getelementptr inbounds nuw i8, ptr %a, i64 8 + %0 = load i32, ptr %e, align 4 + %cmp = icmp eq i32 %0, 255 + br i1 %cmp, label %land.lhs.true, label %land.end + +land.lhs.true: ; preds = %entry + %1 = load i32, ptr %a, align 4 + %2 = load i32, ptr %b, align 4 + %cmp3 = icmp eq i32 %1, %2 + br i1 %cmp3, label %land.lhs.true4, label %land.end + +land.lhs.true4: ; preds = %land.lhs.true + %c = getelementptr inbounds nuw i8, ptr %a, i64 5 + %3 = load i8, ptr %c, align 1 + %c5 = getelementptr inbounds nuw i8, ptr %b, i64 5 + %4 = load i8, ptr %c5, align 1 + %cmp7 = icmp eq i8 %3, %4 + %g = getelementptr inbounds nuw i8, ptr %a, i64 16 + %5 = load i32, ptr %g, align 4 + %cmp9 = icmp eq 
i32 %5, 100 + %or.cond = select i1 %cmp7, i1 %cmp9, i1 false + br i1 %or.cond, label %land.lhs.true10, label %land.end + +land.lhs.true10: ; preds = %land.lhs.true4 + %h = getelementptr inbounds nuw i8, ptr %a, i64 20 + %6 = load i8, ptr %h, align 4 + %h12 = getelementptr inbounds nuw i8, ptr %b, i64 20 + %7 = load i8, ptr %h12, align 4 + %cmp14 = icmp eq i8 %6, %7 + br i1 %cmp14, label %land.rhs, label %land.end + +land.rhs: ; preds = %land.lhs.true10 + %b15 = getelementptr inbounds nuw i8, ptr %a, i64 4 + %8 = load i8, ptr %b15, align 4 + %b17 = getelementptr inbounds nuw i8, ptr %b, i64 4 + %9 = load i8, ptr %b17, align 4 + %cmp19 = icmp eq i8 %8, %9 + br label %land.end + +land.end: ; preds = %land.rhs, %land.lhs.true10, %land.lhs.true4, %land.lhs.true, %entry + %10 = phi i1 [ false, %land.lhs.true10 ], [ false, %land.lhs.true4 ], [ false, %land.lhs.true ], [ false, %entry ], [ %cmp19, %land.rhs ] + ret i1 %10 +} + + +; p[12] and p[13] are mergable. p[12] is inside of a select block which will not be split up, so it shouldn't merge them. 
+ +define dso_local zeroext i1 @cmp_partially_mergable_select_array( + ptr nocapture readonly align 1 dereferenceable(24) %p) local_unnamed_addr { +; CHECK-LABEL: @cmp_partially_mergable_select_array( +; CHECK: entry: +; CHECK-NEXT: [[IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P:%.*]], i64 12 +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[IDX0]], align 1 +; CHECK-NEXT: [[IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[IDX1]], align 1 +; CHECK-NEXT: [[IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[IDX2]], align 1 +; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i8 [[TMP0]], -1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP1]], -56 +; CHECK-NEXT: [[SEL0:%.*]] = select i1 [[CMP0]], i1 [[CMP1]], i1 false +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[TMP2]], -66 +; CHECK-NEXT: [[SEL1:%.*]] = select i1 [[SEL0]], i1 [[CMP2]], i1 false +; CHECK-NEXT: br i1 [[SEL1]], label [[LAND_LHS_TRUE_11:%.*]], label [[LAND_END:%.*]] +; CHECK: land.lhs.true11: +; CHECK-NEXT: [[IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 10 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[IDX3]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP3]], 1 +; CHECK-NEXT: br i1 [[CMP3]], label [[LAND_LHS_TRUE_16:%.*]], label [[LAND_END]] +; CHECK: land.lhs.true16: +; CHECK-NEXT: [[IDX4:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 6 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[IDX4]], align 1 +; CHECK-NEXT: [[CMP4:%.*]] = icmp eq i8 [[TMP4]], 2 +; CHECK-NEXT: br i1 [[CMP4]], label [[LAND_LHS_TRUE_21:%.*]], label [[LAND_END]] +; CHECK: land.lhs.true21: +; CHECK-NEXT: [[IDX5:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[IDX5]], align 1 +; CHECK-NEXT: [[CMP5:%.*]] = icmp eq i8 [[TMP5]], 7 +; CHECK-NEXT: br i1 [[CMP5]], label [[LAND_RHS:%.*]], label [[LAND_END]] +; CHECK: land.rhs: +; CHECK-NEXT: [[IDX6:%.*]] = getelementptr 
inbounds nuw i8, ptr [[P]], i64 13 +; CHECK-NEXT: [[TMP6:%.*]] = load i8, ptr [[IDX6]], align 1 +; CHECK-NEXT: [[CMP6:%.*]] = icmp eq i8 [[TMP6]], 9 +; CHECK-NEXT: br label [[LAND_END]] +; CHECK: land.end: +; CHECK-NEXT: [[RES:%.*]] = phi i1 [ false, [[LAND_LHS_TRUE_21]] ], [ false, [[LAND_LHS_TRUE_16]] ], [ false, [[LAND_LHS_TRUE_11]] ], [ false, %entry ], [ [[CMP6]], [[LAND_RHS]] ] +; CHECK-NEXT: ret i1 [[RES]] +; +entry: + %arrayidx = getelementptr inbounds nuw i8, ptr %p, i64 12 + %0 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds nuw i8, ptr %p, i64 1 + %1 = load i8, ptr %arrayidx1, align 1 + %arrayidx2 = getelementptr inbounds nuw i8, ptr %p, i64 3 + %2 = load i8, ptr %arrayidx2, align 1 + %cmp = icmp eq i8 %0, -1 + %cmp5 = icmp eq i8 %1, -56 + %or.cond = select i1 %cmp, i1 %cmp5, i1 false + %cmp9 = icmp eq i8 %2, -66 + %or.cond30 = select i1 %or.cond, i1 %cmp9, i1 false + br i1 %or.cond30, label %land.lhs.true11, label %land.end + +land.lhs.true11: + %arrayidx12 = getelementptr inbounds nuw i8, ptr %p, i64 10 + %3 = load i8, ptr %arrayidx12, align 1 + %cmp14 = icmp eq i8 %3, 1 + br i1 %cmp14, label %land.lhs.true16, label %land.end + +land.lhs.true16: + %arrayidx17 = getelementptr inbounds nuw i8, ptr %p, i64 6 + %4 = load i8, ptr %arrayidx17, align 1 + %cmp19 = icmp eq i8 %4, 2 + br i1 %cmp19, label %land.lhs.true21, label %land.end + +land.lhs.true21: + %arrayidx22 = getelementptr inbounds nuw i8, ptr %p, i64 8 + %5 = load i8, ptr %arrayidx22, align 1 + %cmp24 = icmp eq i8 %5, 7 + br i1 %cmp24, label %land.rhs, label %land.end + +land.rhs: + %arrayidx26 = getelementptr inbounds nuw i8, ptr %p, i64 13 + %6 = load i8, ptr %arrayidx26, align 1 + %cmp28 = icmp eq i8 %6, 9 + br label %land.end + +land.end: + %7 = phi i1 [ false, %land.lhs.true21 ], [ false, %land.lhs.true16 ], [ false, %land.lhs.true11 ], [ false, %entry ], [ %cmp28, %land.rhs ] + ret i1 %7 +} + diff --git a/llvm/test/Transforms/MergeICmps/X86/single-block.ll 
b/llvm/test/Transforms/MergeICmps/X86/single-block.ll new file mode 100644 index 0000000000000..cd321f435d1f3 --- /dev/null +++ b/llvm/test/Transforms/MergeICmps/X86/single-block.ll @@ -0,0 +1,23 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=mergeicmps -verify-dom-info -S | FileCheck %s + +; Merges adjacent comparisons with constants even if only in single basic block + +define i1 @merge_single(ptr nocapture noundef readonly dereferenceable(2) %p) { +; CHECK-LABEL: @merge_single( +; CHECK: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = alloca <{ i8, i8 }>, align 1 +; CHECK-NEXT: store <{ i8, i8 }> <{ i8 -1, i8 -1 }>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[P]], ptr [[TMP1]], i64 2) +; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; CHECK-NEXT: ret i1 [[CMP0]] +; +entry: + %0 = load i8, ptr %p, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p, i64 1 + %1 = load i8, ptr %arrayidx1, align 1 + %cmp = icmp eq i8 %0, -1 + %cmp3 = icmp eq i8 %1, -1 + %2 = select i1 %cmp, i1 %cmp3, i1 false + ret i1 %2 +} diff --git a/llvm/test/Transforms/MergeICmps/X86/split-block-does-work.ll b/llvm/test/Transforms/MergeICmps/X86/split-block-does-work.ll index c53d86d76ff3b..442d11f9c77fa 100644 --- a/llvm/test/Transforms/MergeICmps/X86/split-block-does-work.ll +++ b/llvm/test/Transforms/MergeICmps/X86/split-block-does-work.ll @@ -1,9 +1,13 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=mergeicmps -verify-dom-info -mtriple=x86_64-unknown-unknown -S | FileCheck %s --check-prefix=X86 +; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes='mergeicmps,expand-memcmp' -verify-dom-info -S 2>&1 | FileCheck %s --check-prefix=EXPANDED %S = type { i32, i32, i32, i32 } declare void @foo(...) +declare void @bar(...) 
+ +; X86: [[MEMCMP_OP:@memcmp_const_op]] = private constant <{ i8, i8, i8 }> <{ i8 100, i8 3, i8 -56 }> +; EXPANDED-NOT: [[MEMCMP_OP:@memcmp_const_op]] = private constant <{ i8, i8, i8 }> <{ i8 100, i8 3, i8 -56 }> ; We can split %entry and create a memcmp(16 bytes). define zeroext i1 @opeq1( @@ -240,3 +244,87 @@ opeq1.exit: %8 = phi i1 [ false, %entry ], [ false, %land.rhs.i] , [ false, %land.rhs.i.2 ], [ %cmp4.i, %land.rhs.i.3 ] ret i1 %8 } + +; Call instruction mixed in with select block but doesn't clobber memory, so can safely sink and merge all comparisons. +; Make sure that call order stays the same. +define dso_local noundef zeroext i1 @unclobbered_select_cmp( +; X86-LABEL: @unclobbered_select_cmp( +; X86-NEXT: "entry+land.rhs": +; X86-NEXT: call void (...) @foo() #[[ATTR2]] +; X86-NEXT: call void (...) @bar() #[[ATTR2]] +; X86-NEXT: [[OFFSET:%.*]] = getelementptr inbounds nuw i8, ptr [[A:%.*]], i64 2 +; X86-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[OFFSET]], ptr [[MEMCMP_OP]], i64 3) +; X86-NEXT: [[TMP1:%.*]] = icmp eq i32 [[MEMCMP]], 0 +; X86-NEXT: br label [[LAND_END:%.*]] +; X86: land.end: +; X86-NEXT: ret i1 [[TMP1]] +; + ptr nocapture readonly dereferenceable(5) %a) local_unnamed_addr nofree nosync { +entry: + %q = getelementptr inbounds nuw i8, ptr %a, i64 4 + %0 = load i8, ptr %q, align 1 + call void (...) @foo() inaccessiblememonly + %cmp = icmp eq i8 %0, 200 + %c = getelementptr inbounds nuw i8, ptr %a, i64 2 + %1 = load i8, ptr %c, align 1 + %cmp2 = icmp eq i8 %1, 100 + call void (...) @bar() inaccessiblememonly + %or.cond = select i1 %cmp, i1 %cmp2, i1 false + br i1 %or.cond, label %land.rhs, label %land.end + +land.rhs: ; preds = %entry + %b3 = getelementptr inbounds nuw i8, ptr %a, i64 3 + %2 = load i8, ptr %b3, align 1 + %cmp5 = icmp eq i8 %2, 3 + br label %land.end + +land.end: ; preds = %land.rhs, %entry + %3 = phi i1 [ false, %entry ], [ %cmp5, %land.rhs ] + ret i1 %3 +} + + +; Can only split first block. 
If a subsequent block contains any other instruction (here a call that doesn't even clobber memory) then don't merge. +define dso_local noundef zeroext i1 @not_split_sec_block( +; X86-LABEL: @not_split_sec_block( +; X86-NEXT: entry: +; X86-NEXT: [[TMP0:%.*]] = load i8, ptr [[A:%.*]], align 1 +; X86-NEXT: call void (...) @foo() #[[ATTR2]] +; X86-NEXT: [[CMP0:%.*]] = icmp eq i8 [[TMP0]], -56 +; X86-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 2 +; X86-NEXT: [[TMP2:%.*]] = load i8, ptr [[TMP1]], align 1 +; X86-NEXT: [[CMP1:%.*]] = icmp eq i8 [[TMP2]], 100 +; X86-NEXT: [[SEL0:%.*]] = select i1 [[CMP0]], i1 [[CMP1]], i1 false +; X86-NEXT: br i1 [[SEL0]], label [[LAND_RHS:%.*]], label [[LAND_END:%.*]] +; X86: land.rhs: +; X86-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 1 +; X86-NEXT: [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1 +; X86-NEXT: call void (...) @bar() #[[ATTR2]] +; X86-NEXT: [[CMP2:%.*]] = icmp eq i8 [[TMP4]], 3 +; X86-NEXT: br label [[LAND_END]] +; X86: land.end: +; X86-NEXT: [[RES:%.*]] = phi i1 [ false, %entry ], [ [[CMP2]], [[LAND_RHS]] ] +; X86-NEXT: ret i1 [[RES]] +; + ptr nocapture readonly dereferenceable(3) %a) local_unnamed_addr nofree nosync { +entry: + %0 = load i8, ptr %a, align 1 + call void (...) @foo() inaccessiblememonly + %cmp = icmp eq i8 %0, 200 + %c = getelementptr inbounds nuw i8, ptr %a, i64 2 + %1 = load i8, ptr %c, align 1 + %cmp2 = icmp eq i8 %1, 100 + %or.cond = select i1 %cmp, i1 %cmp2, i1 false + br i1 %or.cond, label %land.rhs, label %land.end + +land.rhs: ; preds = %entry + %b3 = getelementptr inbounds nuw i8, ptr %a, i64 1 + %2 = load i8, ptr %b3, align 1 +; Even though this call doesn't clobber any memory, can only sink instructions from first block. + call void (...) @bar() inaccessiblememonly + %cmp5 = icmp eq i8 %2, 3 + br label %land.end +land.end: + %3 = phi i1 [ false, %entry ], [ %cmp5, %land.rhs ] + ret i1 %3 +}