From cfb9991c0812cde4afc3dc1b84221814a0e8b0ff Mon Sep 17 00:00:00 2001 From: dianqk Date: Sun, 4 May 2025 20:15:14 +0800 Subject: [PATCH 1/4] Pre-commit test cases --- llvm/test/Transforms/MemCpyOpt/memcpy-load.ll | 96 +++++++++++++++++++ .../PhaseOrdering/pr137810-forward-load.ll | 68 +++++++++++++ 2 files changed, 164 insertions(+) create mode 100644 llvm/test/Transforms/MemCpyOpt/memcpy-load.ll create mode 100644 llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll new file mode 100644 index 0000000000000..79f62cdbfdab4 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll @@ -0,0 +1,96 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=memcpyopt -S -verify-memoryssa | FileCheck %s + +define i24 @forward_load(ptr %src) { +; CHECK-LABEL: define i24 @forward_load( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 3, i1 false) +; CHECK-NEXT: [[VAL1:%.*]] = load i24, ptr [[DEST]], align 4 +; CHECK-NEXT: ret i24 [[VAL1]] +; + %dest = alloca [3 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false) + %val = load i24, ptr %dest + ret i24 %val +} + +define i16 @forward_load_2(ptr %src) { +; CHECK-LABEL: define i16 @forward_load_2( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 2, i1 false) +; CHECK-NEXT: [[VAL1:%.*]] = load i16, ptr [[DEST]], align 2 +; CHECK-NEXT: ret i16 [[VAL1]] +; + %dest = alloca [3 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false) + %val = load i16, ptr %dest + ret i16 %val +} + +define i32 @forward_load_padding(ptr %src) { +; CHECK-LABEL: define i32 @forward_load_padding( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[DEST:%.*]] = alloca { i8, i32 }, align 8 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 8, i1 false) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DEST]], i64 4 +; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: ret i32 [[VAL1]] +; + %dest = alloca { i8, i32 } + call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 8, i1 false) + %gep = getelementptr inbounds i8, ptr %dest, i64 4 + %val = load i32, ptr %gep + ret i32 %val +} + +; Negative tests + +define i24 @failed_forward_load_write_src(ptr %src) { +; CHECK-LABEL: define i24 @failed_forward_load_write_src( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 3, i1 false) +; CHECK-NEXT: store i1 true, ptr [[SRC]], align 1 +; CHECK-NEXT: [[VAL:%.*]] = load i24, ptr [[DEST]], align 4 +; CHECK-NEXT: ret i24 [[VAL]] +; + %dest = alloca [3 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false) + store i1 true, ptr %src + %val = load i24, ptr %dest + ret i24 %val +} + +define i16 @failed_forward_load_size(ptr %src) { +; CHECK-LABEL: define i16 @failed_forward_load_size( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[DEST:%.*]] = alloca [3 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 1, i1 false) +; CHECK-NEXT: [[VAL:%.*]] = load i16, ptr [[DEST]], align 2 +; 
CHECK-NEXT: ret i16 [[VAL]] +; + %dest = alloca [3 x i8] + call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false) + %val = load i16, ptr %dest + ret i16 %val +} + +define i32 @failed_forward_load_padding(ptr %src) { +; CHECK-LABEL: define i32 @failed_forward_load_padding( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[DEST:%.*]] = alloca { i8, i32 }, align 8 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 5, i1 false) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DEST]], i64 4 +; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: ret i32 [[VAL]] +; + %dest = alloca { i8, i32 } + call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 5, i1 false) + %gep = getelementptr inbounds i8, ptr %dest, i64 4 + %val = load i32, ptr %gep + ret i32 %val +} + +declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1) diff --git a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll new file mode 100644 index 0000000000000..224258530ecc0 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -O2 -S < %s | FileCheck %s + +; FIXME: It can return true. +define i1 @main(ptr %i2) { +; CHECK-LABEL: define noundef i1 @main( +; CHECK-SAME: ptr captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr { +; CHECK-NEXT: [[I1:%.*]] = alloca [3 x i8], align 1 +; CHECK-NEXT: store i8 0, ptr [[I2]], align 1 +; CHECK-NEXT: [[I3:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1 +; CHECK-NEXT: store i8 1, ptr [[I3]], align 1 +; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2 +; CHECK-NEXT: store i8 2, ptr [[I4]], align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr nonnull [[I1]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[I1]], ptr noundef nonnull align 1 dereferenceable(3) [[I2]], i64 3, i1 false) +; CHECK-NEXT: [[I51:%.*]] = load i8, ptr [[I1]], align 1 +; CHECK-NEXT: [[I6:%.*]] = icmp eq i8 [[I51]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[I1]], i64 1 +; CHECK-NEXT: [[I82:%.*]] = load i8, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[I9:%.*]] = icmp eq i8 [[I82]], 1 +; CHECK-NEXT: [[I10:%.*]] = select i1 [[I6]], i1 [[I9]], i1 false +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[I1]], i64 2 +; CHECK-NEXT: [[I123:%.*]] = load i8, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[I13:%.*]] = icmp eq i8 [[I123]], 2 +; CHECK-NEXT: [[I14:%.*]] = select i1 [[I10]], i1 [[I13]], i1 false +; CHECK-NEXT: br i1 [[I14]], label %[[TRUE:.*]], label %[[FALSE:.*]] +; CHECK: [[COMMON_RET:.*]]: +; CHECK-NEXT: ret i1 [[I14]] +; CHECK: [[TRUE]]: +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 3, ptr nonnull [[I1]]) +; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK: [[FALSE]]: +; CHECK-NEXT: call void @assert_failed(ptr nonnull [[I1]]) +; CHECK-NEXT: br label %[[COMMON_RET]] +; + %i1 = alloca [3 x i8], align 1 + store i8 0, ptr %i2, align 1 + %i3 = getelementptr inbounds nuw i8, ptr %i2, i64 1 + store i8 1, ptr %i3, align 1 + %i4 = getelementptr inbounds nuw i8, ptr %i2, i64 2 + store i8 2, ptr %i4, align 1 + call void @llvm.lifetime.start.p0(i64 3, ptr nonnull %i1) + call void @llvm.memcpy.p0.p0.i64(ptr %i1, ptr %i2, i64 3, i1 false) + %i5 = load i8, ptr %i1, align 1 + %i6 = icmp eq i8 %i5, 0 + %i7 
= getelementptr inbounds nuw i8, ptr %i1, i64 1
+  %i8 = load i8, ptr %i7, align 1
+  %i9 = icmp eq i8 %i8, 1
+  %i10 = select i1 %i6, i1 %i9, i1 false
+  %i11 = getelementptr inbounds nuw i8, ptr %i1, i64 2
+  %i12 = load i8, ptr %i11, align 1
+  %i13 = icmp eq i8 %i12, 2
+  %i14 = select i1 %i10, i1 %i13, i1 false
+  br i1 %i14, label %true, label %false
+
+true:
+  call void @llvm.lifetime.end.p0(i64 3, ptr nonnull %i1)
+  ret i1 true
+
+false:
+  call void @assert_failed(ptr %i1)
+  ret i1 false
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare void @llvm.lifetime.start.p0(i64, ptr)
+declare void @llvm.lifetime.end.p0(i64, ptr)
+declare void @assert_failed(ptr)

From 0fd9da5b5a0c53a90181451f63e0f6fd90b218f8 Mon Sep 17 00:00:00 2001
From: dianqk
Date: Sun, 4 May 2025 20:55:34 +0800
Subject: [PATCH 2/4] [MemCpyOpt] Forward memcpy source to load instruction

---
 .../llvm/Transforms/Scalar/MemCpyOptimizer.h  |  14 +-
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 275 +++++++++++-------
 llvm/test/Transforms/MemCpyOpt/memcpy-load.ll |   7 +-
 llvm/test/Transforms/MemCpyOpt/memcpy.ll      |   6 +-
 llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll |   2 +-
 .../PhaseOrdering/pr137810-forward-load.ll    |   6 +-
 6 files changed, 191 insertions(+), 119 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
index 496d2958fc2d0..d1369ae918959 100644
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_TRANSFORMS_SCALAR_MEMCPYOPTIMIZER_H
 #define LLVM_TRANSFORMS_SCALAR_MEMCPYOPTIMIZER_H
 
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/PassManager.h"
 
@@ -64,21 +65,28 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
 private:
   // Helper functions
   bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
+  bool processLoad(LoadInst *LI, BasicBlock::iterator &BBI,
+                   SmallVectorImpl<Instruction *> &NewInsts);
   bool processStoreOfLoad(StoreInst *SI, LoadInst *LI, const DataLayout &DL,
                           BasicBlock::iterator &BBI);
   bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
-  bool processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI);
+  bool processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI,
+                     SmallVectorImpl<Instruction *> &NewInsts);
   bool processMemMove(MemMoveInst *M, BasicBlock::iterator &BBI);
   bool performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore,
                             Value *cpyDst, Value *cpySrc, TypeSize cpyLen,
                             Align cpyAlign, BatchAAResults &BAA,
                             std::function<CallInst *()> GetC);
   bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
-                                     BatchAAResults &BAA);
+                                     BatchAAResults &BAA,
+                                     SmallVectorImpl<Instruction *> &NewInsts);
   bool processMemSetMemCpyDependence(MemCpyInst *MemCpy, MemSetInst *MemSet,
                                      BatchAAResults &BAA);
   bool performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet,
                                   BatchAAResults &BAA);
+  bool findNewSrc(MemCpyInst *MDep, Instruction *UseInstr, BatchAAResults &BAA,
+                  Value *&NewSrc, MaybeAlign &NewAlign,
+                  SmallVectorImpl<Instruction *> &NewInsts);
   bool processByValArgument(CallBase &CB, unsigned ArgNo);
   bool processImmutArgument(CallBase &CB, unsigned ArgNo);
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
@@ -90,7 +98,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
   bool isMemMoveMemSetDependency(MemMoveInst *M);
 
   void eraseInstruction(Instruction *I);
-  bool iterateOnFunction(Function &F);
+  bool iterateOnFunction(Function &F, SmallVectorImpl<Instruction *> &NewInsts);
 };
 
 } // end namespace llvm
 
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 6407f48dc2c05..1dfa6bc787278 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -74,6 +74,7 @@ STATISTIC(NumMoveToCpy,   "Number of memmoves converted to memcpy");
 STATISTIC(NumCpyToSet,    "Number of memcpys converted to memset");
 STATISTIC(NumCallSlot,    "Number of call slot optimizations performed");
 STATISTIC(NumStackMove, "Number of stack-move optimizations performed");
+STATISTIC(NumLoadInstr, "Number of load instruction optimizations performed");
 
 namespace {
 
@@ -739,6 +740,145 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
   return false;
 }
 
+bool MemCpyOptPass::findNewSrc(MemCpyInst *MDep, Instruction *UseInstr,
+                               BatchAAResults &BAA, Value *&NewSrc,
+                               MaybeAlign &NewAlign,
+                               SmallVectorImpl<Instruction *> &NewInsts) {
+  auto *MemCpy = dyn_cast<MemCpyInst>(UseInstr);
+  auto *LoadI = dyn_cast<LoadInst>(UseInstr);
+  MemoryLocation UseLoc;
+  Value *OldSrc;
+  if (MemCpy) {
+    UseLoc = MemoryLocation::getForSource(MemCpy);
+    OldSrc = MemCpy->getSource();
+  } else if (LoadI) {
+    UseLoc = MemoryLocation::get(LoadI);
+    OldSrc = LoadI->getPointerOperand();
+  } else
+    return false;
+  uint64_t UseLen = 0;
+  if (UseLoc.Size.hasValue())
+    UseLen = UseLoc.Size.getValue().getKnownMinValue();
+  // If dep instruction is reading from our current input, then it is a noop
+  // transfer and substituting the input won't change this instruction. Just
+  // ignore the input and let someone else zap MDep. This handles cases like:
+  //    memcpy(a <- a)
+  //    memcpy(b <- a)
+  if (OldSrc == MDep->getSource())
+    return false;
+
+  // We can only optimize non-volatile memcpy's.
+  if (MDep->isVolatile())
+    return false;
+
+  int64_t MForwardOffset = 0;
+  const DataLayout &DL = MDep->getDataLayout();
+  // We can only transform memcpy's where the dest of one is the source of the
+  // other, or they have an offset in a range.
+  if (OldSrc != MDep->getDest()) {
+    std::optional<int64_t> Offset =
+        OldSrc->getPointerOffsetFrom(MDep->getDest(), DL);
+    if (!Offset || *Offset < 0)
+      return false;
+    MForwardOffset = *Offset;
+  }
+
+  // The length of the memcpy's must be the same, or the preceding one
+  // must be larger than the following one.
+  if (MForwardOffset != 0 || LoadI ||
+      (MemCpy && MDep->getLength() != MemCpy->getLength())) {
+    auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+    if (UseLen == 0 || !MDepLen ||
+        MDepLen->getZExtValue() < UseLen + MForwardOffset)
+      return false;
+  }
+  IRBuilder<> Builder(UseInstr);
+  NewSrc = MDep->getSource();
+  NewAlign = MDep->getSourceAlign();
+  // We just need to calculate the actual size of the copy.
+  auto MCopyLoc =
+      MemoryLocation::getForSource(MDep).getWithNewSize(UseLoc.Size);
+
+  // When the forwarding offset is greater than 0, we transform
+  //    memcpy(d1 <- s1)
+  //    memcpy(d2 <- d1+o)
+  // to
+  //    memcpy(d2 <- s1+o)
+  if (MForwardOffset > 0) {
+    // The copy destination of `M` may be able to serve as the source of the
+    // copy.
+    if (MemCpy && (MForwardOffset == MemCpy->getRawDest()->getPointerOffsetFrom(
+                                         MDep->getRawSource(), DL))) {
+      NewSrc = cast<MemCpyInst>(UseInstr)->getDest();
+    } else {
+      NewSrc = Builder.CreateInBoundsPtrAdd(NewSrc,
+                                            Builder.getInt64(MForwardOffset));
+      if (Instruction *NewI = dyn_cast<Instruction>(NewSrc))
+        NewInsts.push_back(NewI);
+    }
+    // We need to update `MCopyLoc` if an offset exists.
+    MCopyLoc = MCopyLoc.getWithNewPtr(NewSrc);
+    if (NewAlign)
+      NewAlign = commonAlignment(*NewAlign, MForwardOffset);
+  }
+
+  // Avoid infinite loops
+  if (BAA.isMustAlias(OldSrc, NewSrc))
+    return false;
+  // Verify that the copied-from memory doesn't change in between the two
+  // transfers. For example, in:
+  //    memcpy(a <- b)
+  //    *b = 42;
+  //    memcpy(c <- a)
+  // It would be invalid to transform the second memcpy into memcpy(c <- b).
+  //
+  // TODO: If the code between M and MDep is transparent to the destination "c",
+  // then we could still perform the xform by moving M up to the first memcpy.
+  if (writtenBetween(MSSA, BAA, MCopyLoc, MSSA->getMemoryAccess(MDep),
+                     MSSA->getMemoryAccess(UseInstr)))
+    return false;
+  return true;
+}
+
+/// Perform simplification of loads. If we have a memcpy A that copies X to Y,
+/// and a load instruction B that loads from Y, then we can rewrite B to be a
+/// load instruction that loads from X. This allows later passes to remove the
+/// memcpy A or identify the source of the load instruction.
+bool MemCpyOptPass::processLoad(LoadInst *LI, BasicBlock::iterator &BBI,
+                                SmallVectorImpl<Instruction *> &NewInsts) {
+  if (!LI->isSimple())
+    return false;
+  MemoryUseOrDef *MA = MSSA->getMemoryAccess(LI);
+  if (!MA)
+    return false;
+  BatchAAResults BAA(*AA, EEA);
+
+  MemoryAccess *AnyClobber = MA->getDefiningAccess();
+  const MemoryAccess *DestClobber =
+      MSSA->getWalker()->getClobberingMemoryAccess(
+          AnyClobber, MemoryLocation::get(LI), BAA);
+  MemCpyInst *MDep = nullptr;
+  if (auto *MD = dyn_cast<MemoryDef>(DestClobber))
+    if (Instruction *MI = MD->getMemoryInst())
+      MDep = dyn_cast<MemCpyInst>(MI);
+
+  if (!MDep)
+    return false;
+
+  Value *NewSrc;
+  MaybeAlign NewAlign;
+  if (!findNewSrc(MDep, LI, BAA, NewSrc, NewAlign, NewInsts))
+    return false;
+  IRBuilder<> Builder(LI);
+  Instruction *NewLI =
+      Builder.CreateAlignedLoad(LI->getType(), NewSrc, NewAlign, LI->getName());
+  auto *NewAccess = MSSAU->createMemoryAccessAfter(NewLI, nullptr, MA);
+  MSSAU->insertUse(cast<MemoryUse>(NewAccess), /*RenameUses=*/true);
+  LI->replaceAllUsesWith(NewLI);
+  eraseInstruction(LI);
+  ++NumLoadInstr;
+  return true;
+}
+
 bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
   if (!SI->isSimple())
     return false;
@@ -1101,101 +1241,18 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
 
 /// We've found that the (upward scanning) memory dependence of memcpy 'M' is
 /// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
-bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
-                                                  MemCpyInst *MDep,
-                                                  BatchAAResults &BAA) {
-  // If dep instruction is reading from our current input, then it is a noop
-  // transfer and substituting the input won't change this instruction. Just
-  // ignore the input and let someone else zap MDep. This handles cases like:
-  //    memcpy(a <- a)
-  //    memcpy(b <- a)
-  if (M->getSource() == MDep->getSource())
-    return false;
-
-  // We can only optimize non-volatile memcpy's.
-  if (MDep->isVolatile())
+bool MemCpyOptPass::processMemCpyMemCpyDependence(
+    MemCpyInst *M, MemCpyInst *MDep, BatchAAResults &BAA,
+    SmallVectorImpl<Instruction *> &NewInsts) {
+  Value *NewSrc;
+  MaybeAlign NewAlign;
+  if (!findNewSrc(MDep, M, BAA, NewSrc, NewAlign, NewInsts))
     return false;
 
-  int64_t MForwardOffset = 0;
-  const DataLayout &DL = M->getModule()->getDataLayout();
-  // We can only transforms memcpy's where the dest of one is the source of the
-  // other, or they have an offset in a range.
-  if (M->getSource() != MDep->getDest()) {
-    std::optional<int64_t> Offset =
-        M->getSource()->getPointerOffsetFrom(MDep->getDest(), DL);
-    if (!Offset || *Offset < 0)
-      return false;
-    MForwardOffset = *Offset;
-  }
-
-  // The length of the memcpy's must be the same, or the preceding one
-  // must be larger than the following one.
-  if (MForwardOffset != 0 || MDep->getLength() != M->getLength()) {
-    auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
-    auto *MLen = dyn_cast<ConstantInt>(M->getLength());
-    if (!MDepLen || !MLen ||
-        MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset)
-      return false;
-  }
-
-  IRBuilder<> Builder(M);
-  auto *CopySource = MDep->getSource();
-  Instruction *NewCopySource = nullptr;
-  auto CleanupOnRet = llvm::make_scope_exit([&] {
-    if (NewCopySource && NewCopySource->use_empty())
-      // Safety: It's safe here because we will only allocate more instructions
-      // after finishing all BatchAA queries, but we have to be careful if we
-      // want to do something like this in another place. Then we'd probably
-      // have to delay instruction removal until all transforms on an
-      // instruction finished.
-      eraseInstruction(NewCopySource);
-  });
-  MaybeAlign CopySourceAlign = MDep->getSourceAlign();
-  // We just need to calculate the actual size of the copy.
-  auto MCopyLoc = MemoryLocation::getForSource(MDep).getWithNewSize(
-      MemoryLocation::getForSource(M).Size);
-
-  // When the forwarding offset is greater than 0, we transform
-  //    memcpy(d1 <- s1)
-  //    memcpy(d2 <- d1+o)
-  // to
-  //    memcpy(d2 <- s1+o)
-  if (MForwardOffset > 0) {
-    // The copy destination of `M` maybe can serve as the source of copying.
-    std::optional<int64_t> MDestOffset =
-        M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL);
-    if (MDestOffset == MForwardOffset)
-      CopySource = M->getDest();
-    else {
-      CopySource = Builder.CreateInBoundsPtrAdd(
-          CopySource, Builder.getInt64(MForwardOffset));
-      NewCopySource = dyn_cast<Instruction>(CopySource);
-    }
-    // We need to update `MCopyLoc` if an offset exists.
-    MCopyLoc = MCopyLoc.getWithNewPtr(CopySource);
-    if (CopySourceAlign)
-      CopySourceAlign = commonAlignment(*CopySourceAlign, MForwardOffset);
-  }
-
-  // Avoid infinite loops
-  if (BAA.isMustAlias(M->getSource(), CopySource))
-    return false;
-
-  // Verify that the copied-from memory doesn't change in between the two
-  // transfers. For example, in:
-  //    memcpy(a <- b)
-  //    *b = 42;
-  //    memcpy(c <- a)
-  // It would be invalid to transform the second memcpy into memcpy(c <- b).
-  //
-  // TODO: If the code between M and MDep is transparent to the destination "c",
-  // then we could still perform the xform by moving M up to the first memcpy.
-  if (writtenBetween(MSSA, BAA, MCopyLoc, MSSA->getMemoryAccess(MDep),
-                     MSSA->getMemoryAccess(M)))
-    return false;
 
   // No need to create `memcpy(a <- a)`.
-  if (BAA.isMustAlias(M->getDest(), CopySource)) {
+  if (BAA.isMustAlias(M->getDest(), NewSrc)) {
     // Remove the instruction we're replacing.
     eraseInstruction(M);
     ++NumMemCpyInstr;
@@ -1226,20 +1283,18 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
   // example we could be moving from movaps -> movq on x86.
  Instruction *NewM;
   if (UseMemMove)
-    NewM =
-        Builder.CreateMemMove(M->getDest(), M->getDestAlign(), CopySource,
-                              CopySourceAlign, M->getLength(), M->isVolatile());
+    NewM = Builder.CreateMemMove(M->getDest(), M->getDestAlign(), NewSrc,
+                                 NewAlign, M->getLength(), M->isVolatile());
   else if (isa<MemCpyInlineInst>(M)) {
     // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is
     // never allowed since that would allow the latter to be lowered as a call
     // to an external function.
-    NewM = Builder.CreateMemCpyInline(M->getDest(), M->getDestAlign(),
-                                      CopySource, CopySourceAlign,
-                                      M->getLength(), M->isVolatile());
-  } else
     NewM =
-        Builder.CreateMemCpy(M->getDest(), M->getDestAlign(), CopySource,
-                             CopySourceAlign, M->getLength(), M->isVolatile());
+        Builder.CreateMemCpyInline(M->getDest(), M->getDestAlign(), NewSrc,
+                                   NewAlign, M->getLength(), M->isVolatile());
+  } else
+    NewM = Builder.CreateMemCpy(M->getDest(), M->getDestAlign(), NewSrc,
+                                NewAlign, M->getLength(), M->isVolatile());
 
   NewM->copyMetadata(*M, LLVMContext::MD_DIAssignID);
   assert(isa<MemoryDef>(MSSA->getMemoryAccess(M)));
@@ -1703,7 +1758,8 @@ static bool isZeroSize(Value *Size) {
 /// B to be a memcpy from X to Z (or potentially a memmove, depending on
 /// circumstances). This allows later passes to remove the first memcpy
 /// altogether.
-bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
+bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI,
+                                  SmallVectorImpl<Instruction *> &NewInsts) {
   // We can only optimize non-volatile memcpy's.
   if (M->isVolatile())
     return false;
@@ -1791,7 +1847,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
       }
     }
     if (auto *MDep = dyn_cast<MemCpyInst>(MI))
-      if (processMemCpyMemCpyDependence(M, MDep, BAA))
+      if (processMemCpyMemCpyDependence(M, MDep, BAA, NewInsts))
         return true;
     if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
       if (performMemCpyToMemSetOptzn(M, MDep, BAA)) {
@@ -2096,7 +2152,8 @@ bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) {
 }
 
 /// Executes one iteration of MemCpyOptPass.
-bool MemCpyOptPass::iterateOnFunction(Function &F) {
+bool MemCpyOptPass::iterateOnFunction(
+    Function &F, SmallVectorImpl<Instruction *> &NewInsts) {
   bool MadeChange = false;
 
   // Walk all instruction in the function.
@@ -2114,12 +2171,14 @@
 
       bool RepeatInstruction = false;
 
-      if (auto *SI = dyn_cast<StoreInst>(I))
+      if (auto *LI = dyn_cast<LoadInst>(I))
+        MadeChange |= processLoad(LI, BI, NewInsts);
+      else if (auto *SI = dyn_cast<StoreInst>(I))
         MadeChange |= processStore(SI, BI);
       else if (auto *M = dyn_cast<MemSetInst>(I))
         RepeatInstruction = processMemSet(M, BI);
       else if (auto *M = dyn_cast<MemCpyInst>(I))
-        RepeatInstruction = processMemCpy(M, BI);
+        RepeatInstruction = processMemCpy(M, BI, NewInsts);
      else if (auto *M = dyn_cast<MemMoveInst>(I))
         RepeatInstruction = processMemMove(M, BI);
       else if (auto *CB = dyn_cast<CallBase>(I)) {
@@ -2176,13 +2235,19 @@ bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
   MSSAU = &MSSAU_;
   EarliestEscapeAnalysis EEA_(*DT);
   EEA = &EEA_;
+  SmallVector<Instruction *> NewInsts;
 
   while (true) {
-    if (!iterateOnFunction(F))
+    if (!iterateOnFunction(F, NewInsts))
       break;
     MadeChange = true;
   }
 
+  for (auto *I : NewInsts) {
+    if (I->use_empty())
+      eraseInstruction(I);
+  }
+
   if (VerifyMemorySSA)
     MSSA_->verifyMemorySSA();
 
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll
index 79f62cdbfdab4..462e03f22c2f1 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll
@@ -6,7 +6,7 @@ define i24 @forward_load(ptr %src) {
 ; CHECK-SAME: ptr [[SRC:%.*]]) {
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 3, i1 false)
-; CHECK-NEXT:    [[VAL1:%.*]] = load i24, ptr [[DEST]], align 4
+; CHECK-NEXT:    [[VAL1:%.*]] = load i24, ptr [[SRC]], align 4
 ; CHECK-NEXT:    ret i24 [[VAL1]]
 ;
   %dest = alloca [3 x i8]
@@ -20,7 +20,7 @@ define i16 @forward_load_2(ptr %src) {
 ; CHECK-SAME: ptr [[SRC:%.*]]) {
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 2, i1 false)
-; CHECK-NEXT:    [[VAL1:%.*]] = load i16, ptr [[DEST]], align 2
+; CHECK-NEXT:    [[VAL1:%.*]] = load i16, ptr [[SRC]], align 2
 ; CHECK-NEXT:    ret i16 [[VAL1]]
 ;
   %dest = alloca [3 x i8]
@@ -35,7 +35,8 @@ define i32 @forward_load_padding(ptr %src) {
 ; CHECK-NEXT:    [[DEST:%.*]] = alloca { i8, i32 }, align 8
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 8, i1 false)
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DEST]], i64 4
-; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[TMP1]], align 4
 ; CHECK-NEXT:    ret i32 [[VAL1]]
 ;
   %dest = alloca { i8, i32 }
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
index 89d8eb1ee6711..066325086b7f0 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
@@ -229,10 +229,8 @@ define void @test4_write_between(ptr %P) {
 
 define i8 @test4_read_between(ptr %P) {
 ; CHECK-LABEL: @test4_read_between(
-; CHECK-NEXT:    [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A1]], ptr align 4 [[P:%.*]], i64 8, i1 false)
-; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[A1]], align 1
-; CHECK-NEXT:    call void @test4a(ptr byval(i8) align 1 [[P]])
+; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[A1:%.*]], align 4
+; CHECK-NEXT:    call void @test4a(ptr byval(i8) align 1 [[A1]])
 ; CHECK-NEXT:    ret i8 [[X]]
 ;
   %a1 = alloca %1
diff --git a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll index 5e13432746bf7..51689cc6fd452 100644 --- a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll +++ b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll @@ -20,7 +20,7 @@ define i32 @foo(i1 %z) { ; CHECK-NEXT: br label [[FOR_INC7_1]] ; CHECK: for.inc7.1: ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A]], ptr align 4 [[SCEVGEP]], i64 4, i1 false) -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[SCEVGEP]], align 4 ; CHECK-NEXT: ret i32 [[TMP2]] ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll index 224258530ecc0..006f15a31c4e1 100644 --- a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll +++ b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll @@ -13,13 +13,13 @@ define i1 @main(ptr %i2) { ; CHECK-NEXT: store i8 2, ptr [[I4]], align 1 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr nonnull [[I1]]) ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[I1]], ptr noundef nonnull align 1 dereferenceable(3) [[I2]], i64 3, i1 false) -; CHECK-NEXT: [[I51:%.*]] = load i8, ptr [[I1]], align 1 +; CHECK-NEXT: [[I51:%.*]] = load i8, ptr [[I2]], align 1 ; CHECK-NEXT: [[I6:%.*]] = icmp eq i8 [[I51]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[I1]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1 ; CHECK-NEXT: [[I82:%.*]] = load i8, ptr [[TMP1]], align 1 ; CHECK-NEXT: [[I9:%.*]] = icmp eq i8 [[I82]], 1 ; CHECK-NEXT: [[I10:%.*]] = select i1 [[I6]], i1 [[I9]], i1 false -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[I1]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2 ; CHECK-NEXT: [[I123:%.*]] = load i8, ptr [[TMP2]], align 1 ; CHECK-NEXT: [[I13:%.*]] = icmp eq i8 [[I123]], 2 ; CHECK-NEXT: [[I14:%.*]] = select i1 [[I10]], i1 [[I13]], i1 false From 0fb1679edc6477fea32e52a33c2bf92c6c57c26c Mon Sep 17 00:00:00 2001 From: dianqk Date: Sun, 4 May 2025 20:55:44 +0800 Subject: [PATCH 3/4] [InstCombine] Accumulate the limit only on the instructions that require scanning --- llvm/lib/Analysis/Loads.cpp | 14 +++++- .../Coroutines/coro-retcon-resume-values.ll | 9 ++-- .../JumpThreading/unreachable-loops.ll | 8 ++- .../LowerMatrixIntrinsics/multiply-fused.ll | 24 +++------ .../early-arg-attrs-inference.ll | 2 +- .../PhaseOrdering/pr137810-forward-load.ll | 27 ++-------- .../SLPVectorizer/revec-shufflevector.ll | 5 +- .../SampleProfile/pseudo-probe-instcombine.ll | 50 +++++++++++++++++++ 8 files changed, 85 insertions(+), 54 deletions(-) diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index b461c41d29e84..a8d45bbbe2974 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -539,6 +539,16 @@ static bool areNonOverlapSameBaseLoadAndStore(const Value *LoadPtr, return LoadRange.intersectWith(StoreRange).isEmptySet(); } +static bool maybeAvailableLoadStore(Instruction *Inst) { + switch (Inst->getOpcode()) { + case Instruction::Load: + case Instruction::Store: + return true; + default: + return isa(Inst); + } +} + static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr, Type *AccessTy, bool AtLeastAtomic, const DataLayout &DL, bool *IsLoadCSE) { @@ -653,7 +663,7 @@ Value *llvm::findAvailablePtrLoadStore( ++(*NumScanedInst); // Don't scan huge blocks. 
- if (MaxInstsToScan-- == 0) + if (maybeAvailableLoadStore(Inst) && MaxInstsToScan-- == 0) return nullptr; --ScanFrom; @@ -734,7 +744,7 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA, if (Inst.isDebugOrPseudoInst()) continue; - if (MaxInstsToScan-- == 0) + if (maybeAvailableLoadStore(&Inst) && MaxInstsToScan-- == 0) return nullptr; Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy, diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll index 907d7e588ffe0..bf78174533d5a 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll @@ -38,15 +38,18 @@ define i32 @main() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = tail call ptr @allocate(i32 12) ; CHECK-NEXT: store i32 1, ptr [[TMP0]], align 4 +; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]]) ; CHECK-NEXT: [[N_VAL3_SPILL_ADDR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 4 -; CHECK-NEXT: store i32 1, ptr [[N_VAL3_SPILL_ADDR_I]], align 4, !noalias [[META0:![0-9]+]] +; CHECK-NEXT: store i32 1, ptr [[N_VAL3_SPILL_ADDR_I]], align 4, !noalias [[META0]] ; CHECK-NEXT: [[INPUT_SPILL_ADDR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 ; CHECK-NEXT: store i32 2, ptr [[INPUT_SPILL_ADDR_I]], align 4, !noalias [[META0]] +; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) ; CHECK-NEXT: [[INPUT_RELOAD_ADDR13_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 ; CHECK-NEXT: [[N_VAL3_RELOAD_ADDR11_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 4 -; CHECK-NEXT: store i32 3, ptr [[N_VAL3_RELOAD_ADDR11_I]], align 4, !noalias [[META3:![0-9]+]] +; CHECK-NEXT: store i32 3, ptr [[N_VAL3_RELOAD_ADDR11_I]], align 4, !noalias [[META3]] ; CHECK-NEXT: store i32 4, ptr [[INPUT_RELOAD_ADDR13_I]], align 4, !noalias [[META3]] -; CHECK-NEXT: tail call void @print(i32 7), !noalias [[META6:![0-9]+]] +; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) +; CHECK-NEXT: tail call void @print(i32 7), !noalias [[META6]] ; CHECK-NEXT: tail call void @deallocate(ptr nonnull [[TMP0]]), !noalias [[META6]] ; CHECK-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/JumpThreading/unreachable-loops.ll b/llvm/test/Transforms/JumpThreading/unreachable-loops.ll index 79c5e9217312d..f4d5fc9a26728 100644 --- a/llvm/test/Transforms/JumpThreading/unreachable-loops.ll +++ b/llvm/test/Transforms/JumpThreading/unreachable-loops.ll @@ -191,11 +191,8 @@ define i32 @constant_phi_leads_to_self_reference(ptr %ptr) { ; CHECK-LABEL: @constant_phi_leads_to_self_reference( ; CHECK-NEXT: [[A9:%.*]] = alloca i1, align 1 ; CHECK-NEXT: br label [[F6:%.*]] -; CHECK: T3: +; CHECK: BB5.thread: ; CHECK-NEXT: br label [[BB5:%.*]] -; CHECK: BB5: -; CHECK-NEXT: [[L10:%.*]] = load i1, ptr [[A9]], align 1 -; CHECK-NEXT: br i1 [[L10]], label [[BB6:%.*]], label [[F6]] ; CHECK: BB6: ; CHECK-NEXT: [[LGV3:%.*]] = load i1, ptr [[PTR:%.*]], align 1 ; CHECK-NEXT: [[C4:%.*]] = icmp sle i1 [[C4]], true @@ -204,7 +201,8 @@ define i32 @constant_phi_leads_to_self_reference(ptr %ptr) { ; CHECK: F6: ; CHECK-NEXT: ret i32 0 ; CHECK: F7: -; CHECK-NEXT: br label [[BB5]] +; CHECK-NEXT: [[L10_PR:%.*]] = load i1, ptr [[A9]], align 1 +; CHECK-NEXT: br i1 [[L10_PR]], label [[BB5]], label [[F6]] ; %A9 = alloca i1, align 1 br i1 false, label %BB4, label %F6 
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll index 155f7755c2095..6cbbb534b98b0 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll @@ -263,21 +263,17 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C) ; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[VEC_GEP34:%.*]] = getelementptr i8, ptr [[C]], i64 32 ; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[VEC_GEP34]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 16 -; CHECK-NEXT: [[COL_LOAD35:%.*]] = load <2 x double>, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[VEC_GEP36:%.*]] = getelementptr i8, ptr [[A]], i64 48 -; CHECK-NEXT: [[COL_LOAD37:%.*]] = load <2 x double>, ptr [[VEC_GEP36]], align 8 ; CHECK-NEXT: [[COL_LOAD38:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[VEC_GEP39:%.*]] = getelementptr i8, ptr [[A]], i64 32 ; CHECK-NEXT: [[COL_LOAD40:%.*]] = load <2 x double>, ptr [[VEC_GEP39]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT43:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT43]] +; CHECK-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD17]], [[SPLAT_SPLAT43]] ; CHECK-NEXT: [[SPLAT_SPLAT46:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD37]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD19]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP10]]) ; CHECK-NEXT: [[SPLAT_SPLAT49:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT49]] +; CHECK-NEXT: [[TMP13:%.*]] = fmul contract <2 x double> [[COL_LOAD17]], [[SPLAT_SPLAT49]] ; CHECK-NEXT: [[SPLAT_SPLAT52:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD37]], <2 x double> [[SPLAT_SPLAT52]], <2 x double> [[TMP13]]) +; CHECK-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD19]], <2 x double> [[SPLAT_SPLAT52]], <2 x double> [[TMP13]]) ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[A]], i64 80 ; CHECK-NEXT: [[COL_LOAD53:%.*]] = load <2 x double>, ptr [[TMP15]], align 8 ; CHECK-NEXT: [[VEC_GEP54:%.*]] = getelementptr i8, ptr [[A]], i64 112 @@ -313,22 +309,18 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C) ; CHECK-NEXT: [[TMP25:%.*]] = fmul contract <2 x double> [[COL_LOAD74]], [[SPLAT_SPLAT88]] ; CHECK-NEXT: [[SPLAT_SPLAT91:%.*]] = shufflevector <2 x double> [[COL_LOAD79]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD76]], <2 x double> [[SPLAT_SPLAT91]], <2 x double> [[TMP25]]) -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[A]], i64 64 -; CHECK-NEXT: [[COL_LOAD92:%.*]] = load <2 x double>, ptr [[TMP27]], align 8 -; CHECK-NEXT: [[VEC_GEP93:%.*]] = getelementptr i8, ptr [[A]], i64 96 -; 
CHECK-NEXT: [[COL_LOAD94:%.*]] = load <2 x double>, ptr [[VEC_GEP93]], align 8 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[A]], i64 80 ; CHECK-NEXT: [[COL_LOAD95:%.*]] = load <2 x double>, ptr [[TMP28]], align 8 ; CHECK-NEXT: [[VEC_GEP96:%.*]] = getelementptr i8, ptr [[A]], i64 112 ; CHECK-NEXT: [[COL_LOAD97:%.*]] = load <2 x double>, ptr [[VEC_GEP96]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT101:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT101]], <2 x double> [[TMP24]]) +; CHECK-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD77]], <2 x double> [[SPLAT_SPLAT101]], <2 x double> [[TMP24]]) ; CHECK-NEXT: [[SPLAT_SPLAT104:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD94]], <2 x double> [[SPLAT_SPLAT104]], <2 x double> [[TMP29]]) +; CHECK-NEXT: [[TMP30:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD79]], <2 x double> [[SPLAT_SPLAT104]], <2 x double> [[TMP27]]) ; CHECK-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP31:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP26]]) +; CHECK-NEXT: [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD77]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP26]]) ; CHECK-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD94]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP31]]) +; CHECK-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD79]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP29]]) ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[C]], i64 64 ; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP33]], align 8 ; CHECK-NEXT: [[VEC_GEP112:%.*]] = getelementptr i8, ptr [[C]], i64 96 diff --git a/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll b/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll index 93a8c803aba37..b14d5e590ae7b 100644 --- a/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll +++ b/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll @@ -3,7 +3,7 @@ define i32 @f(ptr noalias %p, i32 %c) { ; CHECK-LABEL: define noundef i32 @f -; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], i32 [[C:%.*]]) local_unnamed_addr { +; CHECK-SAME: (ptr noalias readnone captures(none) [[P:%.*]], i32 [[C:%.*]]) local_unnamed_addr { ; CHECK-NEXT: tail call void @g() ; CHECK-NEXT: tail call void @g() ; CHECK-NEXT: tail call void @g() diff --git a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll index 006f15a31c4e1..98eb90c184d74 100644 --- a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll +++ b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll @@ -1,37 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -O2 -S < %s | FileCheck %s 
-; FIXME: It can return true. define i1 @main(ptr %i2) { ; CHECK-LABEL: define noundef i1 @main( -; CHECK-SAME: ptr captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr { -; CHECK-NEXT: [[I1:%.*]] = alloca [3 x i8], align 1 +; CHECK-SAME: ptr writeonly captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TRUE:.*:]] ; CHECK-NEXT: store i8 0, ptr [[I2]], align 1 ; CHECK-NEXT: [[I3:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1 ; CHECK-NEXT: store i8 1, ptr [[I3]], align 1 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2 ; CHECK-NEXT: store i8 2, ptr [[I4]], align 1 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr nonnull [[I1]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[I1]], ptr noundef nonnull align 1 dereferenceable(3) [[I2]], i64 3, i1 false) -; CHECK-NEXT: [[I51:%.*]] = load i8, ptr [[I2]], align 1 -; CHECK-NEXT: [[I6:%.*]] = icmp eq i8 [[I51]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1 -; CHECK-NEXT: [[I82:%.*]] = load i8, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[I9:%.*]] = icmp eq i8 [[I82]], 1 -; CHECK-NEXT: [[I10:%.*]] = select i1 [[I6]], i1 [[I9]], i1 false -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2 -; CHECK-NEXT: [[I123:%.*]] = load i8, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[I13:%.*]] = icmp eq i8 [[I123]], 2 -; CHECK-NEXT: [[I14:%.*]] = select i1 [[I10]], i1 [[I13]], i1 false -; CHECK-NEXT: br i1 [[I14]], label %[[TRUE:.*]], label %[[FALSE:.*]] -; CHECK: [[COMMON_RET:.*]]: -; CHECK-NEXT: ret i1 [[I14]] -; CHECK: [[TRUE]]: -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 3, ptr nonnull [[I1]]) -; CHECK-NEXT: br label %[[COMMON_RET]] -; CHECK: [[FALSE]]: -; CHECK-NEXT: call void @assert_failed(ptr nonnull [[I1]]) -; CHECK-NEXT: br label %[[COMMON_RET]] +; CHECK-NEXT: ret i1 true ; %i1 = alloca [3 x i8], align 1 store i8 0, ptr %i2, align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll index b85c78ec8d2d0..d91dfc01649bc 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll @@ -231,14 +231,13 @@ define void @test6(ptr %in0, ptr %in1, ptr %in2) { ; COMBINE-NEXT: [[TMP7:%.*]] = fmul <32 x float> [[TMP6]], [[TMP2]] ; COMBINE-NEXT: [[GEP10:%.*]] = getelementptr inbounds nuw i8, ptr [[IN1]], i64 32 ; COMBINE-NEXT: [[GEP11:%.*]] = getelementptr inbounds nuw i8, ptr [[IN2:%.*]], i64 128 -; COMBINE-NEXT: [[TMP8:%.*]] = load <8 x float>, ptr [[IN0]], align 16 ; COMBINE-NEXT: store <32 x float> [[TMP7]], ptr [[IN2]], align 16 ; COMBINE-NEXT: [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1 ; COMBINE-NEXT: [[TMP9:%.*]] = uitofp <16 x i8> [[LOAD5]] to <16 x float> ; COMBINE-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> -; COMBINE-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <16 x i32> +; COMBINE-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> ; COMBINE-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32> -; COMBINE-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <4 x i32> +; COMBINE-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> ; COMBINE-NEXT: [[TMP14:%.*]] = 
shufflevector <4 x float> [[TMP13]], <4 x float> poison, <16 x i32> ; COMBINE-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> [[TMP12]], <16 x float> [[TMP14]], <16 x i32> ; COMBINE-NEXT: [[TMP16:%.*]] = shufflevector <16 x float> [[TMP15]], <16 x float> poison, <16 x i32> diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll index ff1e165c8c54a..34839b5140b7f 100644 --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll @@ -106,6 +106,56 @@ define i32 @load(ptr nocapture %a, ptr nocapture %b) { ret i32 %5 } +;; Check the load is deleted. +define i32 @load_not_pseudo(ptr noalias %arg, ptr noalias %arg1) { +; CHECK-LABEL: @load_not_pseudo( +; CHECK-NEXT: bb: +; CHECK-NEXT: store i32 1, ptr [[ARG1:%.*]], align 4 +; CHECK-NEXT: store i32 1, ptr [[ARG2:%.*]], align 4 +; CHECK-NEXT: ret i32 1 +; +bb: + store i32 1, ptr %arg, align 4 + store i32 1, ptr %arg1, align 4 + %i = load i32, ptr %arg, align 4 + ret i32 %i +} + +;; Check the load is deleted. +define i32 @load_not_pseudo_2(ptr noalias %arg, ptr noalias %arg1) { +; CHECK-LABEL: @load_not_pseudo_2( +; CHECK-NEXT: bb: +; CHECK-NEXT: store i32 1, ptr [[ARG:%.*]], align 4 +; CHECK-NEXT: [[ARG1_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG1:%.*]], i64 4 +; CHECK-NEXT: store i32 1, ptr [[ARG1_1]], align 4 +; CHECK-NEXT: ret i32 1 +; +bb: + store i32 1, ptr %arg, align 4 + %arg1_1 = getelementptr inbounds i32, ptr %arg1, i32 1 + store i32 1, ptr %arg1_1, align 4 + %i = load i32, ptr %arg, align 4 + ret i32 %i +} + +;; Check the load is not deleted. +define i32 @load_not_pseudo_3(ptr noalias %arg, ptr noalias %arg1, ptr noalias %arg2) { +; CHECK-LABEL: @load_not_pseudo_3( +; CHECK-NEXT: bb: +; CHECK-NEXT: store i32 1, ptr [[ARG:%.*]], align 4 +; CHECK-NEXT: store i32 1, ptr [[ARG1:%.*]], align 4 +; CHECK-NEXT: store i32 1, ptr [[ARG2:%.*]], align 4 +; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[ARG]], align 4 +; CHECK-NEXT: ret i32 [[I]] +; +bb: + store i32 1, ptr %arg, align 4 + store i32 1, ptr %arg1, align 4 + store i32 1, ptr %arg2, align 4 + %i = load i32, ptr %arg, align 4 + ret i32 %i +} + ;; Check the first store is deleted. define void @dse(ptr %p) { ; CHECK-LABEL: @dse( From 6efcb44027cf509d65b4c7a2b112604bd5e3d0e9 Mon Sep 17 00:00:00 2001 From: dianqk Date: Sun, 4 May 2025 22:00:07 +0800 Subject: [PATCH 4/4] Revert "[InstCombine] Accumulate the limit only on the instructions that require" This reverts commit 0fb1679edc6477fea32e52a33c2bf92c6c57c26c. 
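
This restores the previous behavior in which every scanned instruction counts
against MaxInstsToScan in FindAvailableLoadedValue and
findAvailablePtrLoadStore, and restores the test expectations that depended on
it.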
--- llvm/lib/Analysis/Loads.cpp | 14 +----- .../Coroutines/coro-retcon-resume-values.ll | 9 ++-- .../JumpThreading/unreachable-loops.ll | 8 +-- .../LowerMatrixIntrinsics/multiply-fused.ll | 24 ++++++--- .../early-arg-attrs-inference.ll | 2 +- .../PhaseOrdering/pr137810-forward-load.ll | 27 ++++++++-- .../SLPVectorizer/revec-shufflevector.ll | 5 +- .../SampleProfile/pseudo-probe-instcombine.ll | 50 ------------------- 8 files changed, 54 insertions(+), 85 deletions(-) diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index a8d45bbbe2974..b461c41d29e84 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -539,16 +539,6 @@ static bool areNonOverlapSameBaseLoadAndStore(const Value *LoadPtr, return LoadRange.intersectWith(StoreRange).isEmptySet(); } -static bool maybeAvailableLoadStore(Instruction *Inst) { - switch (Inst->getOpcode()) { - case Instruction::Load: - case Instruction::Store: - return true; - default: - return isa(Inst); - } -} - static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr, Type *AccessTy, bool AtLeastAtomic, const DataLayout &DL, bool *IsLoadCSE) { @@ -663,7 +653,7 @@ Value *llvm::findAvailablePtrLoadStore( ++(*NumScanedInst); // Don't scan huge blocks. - if (maybeAvailableLoadStore(Inst) && MaxInstsToScan-- == 0) + if (MaxInstsToScan-- == 0) return nullptr; --ScanFrom; @@ -744,7 +734,7 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA, if (Inst.isDebugOrPseudoInst()) continue; - if (maybeAvailableLoadStore(&Inst) && MaxInstsToScan-- == 0) + if (MaxInstsToScan-- == 0) return nullptr; Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy, diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll index bf78174533d5a..907d7e588ffe0 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll @@ -38,18 +38,15 @@ define i32 @main() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = tail call ptr @allocate(i32 12) ; CHECK-NEXT: store i32 1, ptr [[TMP0]], align 4 -; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]]) ; CHECK-NEXT: [[N_VAL3_SPILL_ADDR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 4 -; CHECK-NEXT: store i32 1, ptr [[N_VAL3_SPILL_ADDR_I]], align 4, !noalias [[META0]] +; CHECK-NEXT: store i32 1, ptr [[N_VAL3_SPILL_ADDR_I]], align 4, !noalias [[META0:![0-9]+]] ; CHECK-NEXT: [[INPUT_SPILL_ADDR_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 ; CHECK-NEXT: store i32 2, ptr [[INPUT_SPILL_ADDR_I]], align 4, !noalias [[META0]] -; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) ; CHECK-NEXT: [[INPUT_RELOAD_ADDR13_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 8 ; CHECK-NEXT: [[N_VAL3_RELOAD_ADDR11_I:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 4 -; CHECK-NEXT: store i32 3, ptr [[N_VAL3_RELOAD_ADDR11_I]], align 4, !noalias [[META3]] +; CHECK-NEXT: store i32 3, ptr [[N_VAL3_RELOAD_ADDR11_I]], align 4, !noalias [[META3:![0-9]+]] ; CHECK-NEXT: store i32 4, ptr [[INPUT_RELOAD_ADDR13_I]], align 4, !noalias [[META3]] -; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META6:![0-9]+]]) -; CHECK-NEXT: tail call void @print(i32 7), !noalias [[META6]] +; CHECK-NEXT: tail call void @print(i32 7), !noalias [[META6:![0-9]+]] ; CHECK-NEXT: tail call void @deallocate(ptr 
nonnull [[TMP0]]), !noalias [[META6]]
 ; CHECK-NEXT: ret i32 0
 ;
diff --git a/llvm/test/Transforms/JumpThreading/unreachable-loops.ll b/llvm/test/Transforms/JumpThreading/unreachable-loops.ll
index f4d5fc9a26728..79c5e9217312d 100644
--- a/llvm/test/Transforms/JumpThreading/unreachable-loops.ll
+++ b/llvm/test/Transforms/JumpThreading/unreachable-loops.ll
@@ -191,8 +191,11 @@ define i32 @constant_phi_leads_to_self_reference(ptr %ptr) {
 ; CHECK-LABEL: @constant_phi_leads_to_self_reference(
 ; CHECK-NEXT: [[A9:%.*]] = alloca i1, align 1
 ; CHECK-NEXT: br label [[F6:%.*]]
-; CHECK: BB5.thread:
+; CHECK: T3:
 ; CHECK-NEXT: br label [[BB5:%.*]]
+; CHECK: BB5:
+; CHECK-NEXT: [[L10:%.*]] = load i1, ptr [[A9]], align 1
+; CHECK-NEXT: br i1 [[L10]], label [[BB6:%.*]], label [[F6]]
 ; CHECK: BB6:
 ; CHECK-NEXT: [[LGV3:%.*]] = load i1, ptr [[PTR:%.*]], align 1
 ; CHECK-NEXT: [[C4:%.*]] = icmp sle i1 [[C4]], true
@@ -201,8 +204,7 @@ define i32 @constant_phi_leads_to_self_reference(ptr %ptr) {
 ; CHECK: F6:
 ; CHECK-NEXT: ret i32 0
 ; CHECK: F7:
-; CHECK-NEXT: [[L10_PR:%.*]] = load i1, ptr [[A9]], align 1
-; CHECK-NEXT: br i1 [[L10_PR]], label [[BB5]], label [[F6]]
+; CHECK-NEXT: br label [[BB5]]
 ;
 %A9 = alloca i1, align 1
 br i1 false, label %BB4, label %F6
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
index 6cbbb534b98b0..155f7755c2095 100644
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll
@@ -263,17 +263,21 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C)
 ; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[C:%.*]], align 8
 ; CHECK-NEXT: [[VEC_GEP34:%.*]] = getelementptr i8, ptr [[C]], i64 32
 ; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[VEC_GEP34]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 16
+; CHECK-NEXT: [[COL_LOAD35:%.*]] = load <2 x double>, ptr [[TMP10]], align 8
+; CHECK-NEXT: [[VEC_GEP36:%.*]] = getelementptr i8, ptr [[A]], i64 48
+; CHECK-NEXT: [[COL_LOAD37:%.*]] = load <2 x double>, ptr [[VEC_GEP36]], align 8
 ; CHECK-NEXT: [[COL_LOAD38:%.*]] = load <2 x double>, ptr [[A]], align 8
 ; CHECK-NEXT: [[VEC_GEP39:%.*]] = getelementptr i8, ptr [[A]], i64 32
 ; CHECK-NEXT: [[COL_LOAD40:%.*]] = load <2 x double>, ptr [[VEC_GEP39]], align 8
 ; CHECK-NEXT: [[SPLAT_SPLAT43:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD17]], [[SPLAT_SPLAT43]]
+; CHECK-NEXT: [[TMP11:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT43]]
 ; CHECK-NEXT: [[SPLAT_SPLAT46:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP12:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD19]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP10]])
+; CHECK-NEXT: [[TMP12:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD37]], <2 x double> [[SPLAT_SPLAT46]], <2 x double> [[TMP11]])
 ; CHECK-NEXT: [[SPLAT_SPLAT49:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP13:%.*]] = fmul contract <2 x double> [[COL_LOAD17]], [[SPLAT_SPLAT49]]
+; CHECK-NEXT: [[TMP13:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT49]]
 ; CHECK-NEXT: [[SPLAT_SPLAT52:%.*]] = shufflevector <2 x double> [[COL_LOAD40]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD19]], <2 x double> [[SPLAT_SPLAT52]], <2 x double> [[TMP13]])
+; CHECK-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD37]], <2 x double> [[SPLAT_SPLAT52]], <2 x double> [[TMP13]])
 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[A]], i64 80
 ; CHECK-NEXT: [[COL_LOAD53:%.*]] = load <2 x double>, ptr [[TMP15]], align 8
 ; CHECK-NEXT: [[VEC_GEP54:%.*]] = getelementptr i8, ptr [[A]], i64 112
@@ -309,18 +313,22 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C)
 ; CHECK-NEXT: [[TMP25:%.*]] = fmul contract <2 x double> [[COL_LOAD74]], [[SPLAT_SPLAT88]]
 ; CHECK-NEXT: [[SPLAT_SPLAT91:%.*]] = shufflevector <2 x double> [[COL_LOAD79]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
 ; CHECK-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD76]], <2 x double> [[SPLAT_SPLAT91]], <2 x double> [[TMP25]])
+; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[A]], i64 64
+; CHECK-NEXT: [[COL_LOAD92:%.*]] = load <2 x double>, ptr [[TMP27]], align 8
+; CHECK-NEXT: [[VEC_GEP93:%.*]] = getelementptr i8, ptr [[A]], i64 96
+; CHECK-NEXT: [[COL_LOAD94:%.*]] = load <2 x double>, ptr [[VEC_GEP93]], align 8
 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[A]], i64 80
 ; CHECK-NEXT: [[COL_LOAD95:%.*]] = load <2 x double>, ptr [[TMP28]], align 8
 ; CHECK-NEXT: [[VEC_GEP96:%.*]] = getelementptr i8, ptr [[A]], i64 112
 ; CHECK-NEXT: [[COL_LOAD97:%.*]] = load <2 x double>, ptr [[VEC_GEP96]], align 8
 ; CHECK-NEXT: [[SPLAT_SPLAT101:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD77]], <2 x double> [[SPLAT_SPLAT101]], <2 x double> [[TMP24]])
+; CHECK-NEXT: [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT101]], <2 x double> [[TMP24]])
 ; CHECK-NEXT: [[SPLAT_SPLAT104:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP30:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD79]], <2 x double> [[SPLAT_SPLAT104]], <2 x double> [[TMP27]])
+; CHECK-NEXT: [[TMP30:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD94]], <2 x double> [[SPLAT_SPLAT104]], <2 x double> [[TMP29]])
 ; CHECK-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD77]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP26]])
+; CHECK-NEXT: [[TMP31:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP26]])
 ; CHECK-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD79]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP29]])
+; CHECK-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD94]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP31]])
 ; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[C]], i64 64
 ; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP33]], align 8
 ; CHECK-NEXT: [[VEC_GEP112:%.*]] = getelementptr i8, ptr [[C]], i64 96
diff --git a/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll b/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll
index b14d5e590ae7b..93a8c803aba37 100644
--- a/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll
+++ b/llvm/test/Transforms/PhaseOrdering/early-arg-attrs-inference.ll
@@ -3,7 +3,7 @@

 define i32 @f(ptr noalias %p, i32 %c) {
 ; CHECK-LABEL: define noundef i32 @f
-; CHECK-SAME: (ptr noalias readnone captures(none) [[P:%.*]], i32 [[C:%.*]]) local_unnamed_addr {
+; CHECK-SAME: (ptr noalias readonly captures(none) [[P:%.*]], i32 [[C:%.*]]) local_unnamed_addr {
 ; CHECK-NEXT: tail call void @g()
 ; CHECK-NEXT: tail call void @g()
 ; CHECK-NEXT: tail call void @g()
diff --git a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
index 98eb90c184d74..006f15a31c4e1 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
@@ -1,16 +1,37 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt -O2 -S < %s | FileCheck %s

+; FIXME: It can return true.
 define i1 @main(ptr %i2) {
 ; CHECK-LABEL: define noundef i1 @main(
-; CHECK-SAME: ptr writeonly captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[TRUE:.*:]]
+; CHECK-SAME: ptr captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr {
+; CHECK-NEXT: [[I1:%.*]] = alloca [3 x i8], align 1
 ; CHECK-NEXT: store i8 0, ptr [[I2]], align 1
 ; CHECK-NEXT: [[I3:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
 ; CHECK-NEXT: store i8 1, ptr [[I3]], align 1
 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
 ; CHECK-NEXT: store i8 2, ptr [[I4]], align 1
-; CHECK-NEXT: ret i1 true
+; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 3, ptr nonnull [[I1]])
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[I1]], ptr noundef nonnull align 1 dereferenceable(3) [[I2]], i64 3, i1 false)
+; CHECK-NEXT: [[I51:%.*]] = load i8, ptr [[I2]], align 1
+; CHECK-NEXT: [[I6:%.*]] = icmp eq i8 [[I51]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
+; CHECK-NEXT: [[I82:%.*]] = load i8, ptr [[TMP1]], align 1
+; CHECK-NEXT: [[I9:%.*]] = icmp eq i8 [[I82]], 1
+; CHECK-NEXT: [[I10:%.*]] = select i1 [[I6]], i1 [[I9]], i1 false
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
+; CHECK-NEXT: [[I123:%.*]] = load i8, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[I13:%.*]] = icmp eq i8 [[I123]], 2
+; CHECK-NEXT: [[I14:%.*]] = select i1 [[I10]], i1 [[I13]], i1 false
+; CHECK-NEXT: br i1 [[I14]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; CHECK: [[COMMON_RET:.*]]:
+; CHECK-NEXT: ret i1 [[I14]]
+; CHECK: [[TRUE]]:
+; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 3, ptr nonnull [[I1]])
+; CHECK-NEXT: br label %[[COMMON_RET]]
+; CHECK: [[FALSE]]:
+; CHECK-NEXT: call void @assert_failed(ptr nonnull [[I1]])
+; CHECK-NEXT: br label %[[COMMON_RET]]
 ;
 %i1 = alloca [3 x i8], align 1
 store i8 0, ptr %i2, align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
index d91dfc01649bc..b85c78ec8d2d0 100644
--- a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
+++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll
@@ -231,13 +231,14 @@ define void @test6(ptr %in0, ptr %in1, ptr %in2) {
 ; COMBINE-NEXT: [[TMP7:%.*]] = fmul <32 x float> [[TMP6]], [[TMP2]]
 ; COMBINE-NEXT: [[GEP10:%.*]] = getelementptr inbounds nuw i8, ptr [[IN1]], i64 32
 ; COMBINE-NEXT: [[GEP11:%.*]] = getelementptr inbounds nuw i8, ptr [[IN2:%.*]], i64 128
+; COMBINE-NEXT: [[TMP8:%.*]] = load <8 x float>, ptr [[IN0]], align 16
 ; COMBINE-NEXT: store <32 x float> [[TMP7]], ptr [[IN2]], align 16
 ; COMBINE-NEXT: [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1
 ; COMBINE-NEXT: [[TMP9:%.*]] = uitofp <16 x i8> [[LOAD5]] to <16 x float>
 ; COMBINE-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32>
-; COMBINE-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32>
+; COMBINE-NEXT: [[TMP11:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <16 x i32>
 ; COMBINE-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP10]], <16 x float> [[TMP11]], <16 x i32>
-; COMBINE-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32>
+; COMBINE-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <4 x i32>
 ; COMBINE-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> poison, <16 x i32>
 ; COMBINE-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> [[TMP12]], <16 x float> [[TMP14]], <16 x i32>
 ; COMBINE-NEXT: [[TMP16:%.*]] = shufflevector <16 x float> [[TMP15]], <16 x float> poison, <16 x i32>
diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll
index 34839b5140b7f..ff1e165c8c54a 100644
--- a/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll
+++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll
@@ -106,56 +106,6 @@ define i32 @load(ptr nocapture %a, ptr nocapture %b) {
 ret i32 %5
 }

-;; Check the load is deleted.
-define i32 @load_not_pseudo(ptr noalias %arg, ptr noalias %arg1) {
-; CHECK-LABEL: @load_not_pseudo(
-; CHECK-NEXT: bb:
-; CHECK-NEXT: store i32 1, ptr [[ARG1:%.*]], align 4
-; CHECK-NEXT: store i32 1, ptr [[ARG2:%.*]], align 4
-; CHECK-NEXT: ret i32 1
-;
-bb:
- store i32 1, ptr %arg, align 4
- store i32 1, ptr %arg1, align 4
- %i = load i32, ptr %arg, align 4
- ret i32 %i
-}
-
-;; Check the load is deleted.
-define i32 @load_not_pseudo_2(ptr noalias %arg, ptr noalias %arg1) {
-; CHECK-LABEL: @load_not_pseudo_2(
-; CHECK-NEXT: bb:
-; CHECK-NEXT: store i32 1, ptr [[ARG:%.*]], align 4
-; CHECK-NEXT: [[ARG1_1:%.*]] = getelementptr inbounds nuw i8, ptr [[ARG1:%.*]], i64 4
-; CHECK-NEXT: store i32 1, ptr [[ARG1_1]], align 4
-; CHECK-NEXT: ret i32 1
-;
-bb:
- store i32 1, ptr %arg, align 4
- %arg1_1 = getelementptr inbounds i32, ptr %arg1, i32 1
- store i32 1, ptr %arg1_1, align 4
- %i = load i32, ptr %arg, align 4
- ret i32 %i
-}
-
-;; Check the load is not deleted.
-define i32 @load_not_pseudo_3(ptr noalias %arg, ptr noalias %arg1, ptr noalias %arg2) {
-; CHECK-LABEL: @load_not_pseudo_3(
-; CHECK-NEXT: bb:
-; CHECK-NEXT: store i32 1, ptr [[ARG:%.*]], align 4
-; CHECK-NEXT: store i32 1, ptr [[ARG1:%.*]], align 4
-; CHECK-NEXT: store i32 1, ptr [[ARG2:%.*]], align 4
-; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[ARG]], align 4
-; CHECK-NEXT: ret i32 [[I]]
-;
-bb:
- store i32 1, ptr %arg, align 4
- store i32 1, ptr %arg1, align 4
- store i32 1, ptr %arg2, align 4
- %i = load i32, ptr %arg, align 4
- ret i32 %i
-}
-
 ;; Check the first store is deleted.
 define void @dse(ptr %p) {
 ; CHECK-LABEL: @dse(