diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 639070c07897b..94d761379a9c5 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -153,9 +153,10 @@ Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
 /// This overload provides a more efficient implementation of
 /// FindAvailableLoadedValue() for the case where we are not interested in
 /// finding the closest clobbering instruction if no available load is found.
-/// This overload cannot be used to scan across multiple blocks.
+/// This overload cannot be used to scan across multiple blocks. If a memcpy is
+/// returned, the value can instead be loaded from its source at \p Offset.
 Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
-                                bool *IsLoadCSE,
+                                bool *IsLoadCSE, int64_t &Offset,
                                 unsigned MaxInstsToScan = DefMaxInstsToScan);
 
 /// Scan backwards to see if we have the value of the given pointer available
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 425f3682122cd..f766331dab2f1 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -713,8 +713,31 @@ Value *llvm::findAvailablePtrLoadStore(
   return nullptr;
 }
 
+static Value *availableMemCpySrc(LoadInst *LI, MemCpyInst *MemCpy,
+                                 int64_t &Offset) {
+  if (!LI->isSimple() || MemCpy->isVolatile())
+    return nullptr;
+  const DataLayout &DL = LI->getDataLayout();
+  uint64_t Size = DL.getTypeStoreSize(LI->getType()).getKnownMinValue();
+  if (Size == 0)
+    return nullptr;
+  Value *OldSrc = LI->getPointerOperand();
+
+  if (OldSrc != MemCpy->getDest()) {
+    std::optional<int64_t> PointerOffset =
+        OldSrc->getPointerOffsetFrom(MemCpy->getDest(), DL);
+    if (!PointerOffset || *PointerOffset < 0)
+      return nullptr;
+    Offset = *PointerOffset;
+  }
+  auto *CopyLen = dyn_cast<ConstantInt>(MemCpy->getLength());
+  if (!CopyLen || CopyLen->getZExtValue() < Size + Offset)
+    return nullptr;
+  return MemCpy;
+}
+
 Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
-                                      bool *IsLoadCSE,
+                                      bool *IsLoadCSE, int64_t &Offset,
                                       unsigned MaxInstsToScan) {
   const DataLayout &DL = Load->getDataLayout();
   Value *StrippedPtr = Load->getPointerOperand()->stripPointerCasts();
@@ -739,6 +762,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
 
     Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
                                       AtLeastAtomic, DL, IsLoadCSE);
+    if (auto *MemCpy = dyn_cast<MemCpyInst>(&Inst))
+      Available = availableMemCpySrc(Load, MemCpy, Offset);
+
     if (Available)
       break;
 
@@ -753,6 +779,12 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
     for (Instruction *Inst : MustNotAliasInsts)
       if (isModSet(AA.getModRefInfo(Inst, Loc)))
        return nullptr;
+    if (auto *MemCpy = dyn_cast<MemCpyInst>(Available)) {
+      MemoryLocation Loc = MemoryLocation::getForSource(MemCpy);
+      for (Instruction *Inst : MustNotAliasInsts)
+        if (isModSet(AA.getModRefInfo(Inst, Loc)))
+          return nullptr;
+    }
   }
 
   return Available;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index c29cba6f675c5..cf0ebc9fd043f 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1053,13 +1053,45 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
   // separated by a few arithmetic operations.
   bool IsLoadCSE = false;
   BatchAAResults BatchAA(*AA);
-  if (Value *AvailableVal = FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE)) {
+  int64_t Offset = 0;
+  if (Value *AvailableVal =
+          FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE, Offset)) {
     if (IsLoadCSE)
       combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
 
-    return replaceInstUsesWith(
-        LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
-                                           LI.getName() + ".cast"));
+    // Perform forwarding of the loaded value. If we have a memcpy A that
+    // copies X to Y, and a load instruction B that loads from Y, then we can
+    // rewrite B to load from X instead. This allows later passes to remove
+    // the memcpy A or to identify the true source of the load.
+    if (auto *MemCpy = dyn_cast<MemCpyInst>(AvailableVal)) {
+      Value *NewSrc = MemCpy->getSource();
+      Value *OldSrc = LI.getPointerOperand();
+      MaybeAlign NewAlign = MemCpy->getSourceAlign();
+      if (Offset != 0) {
+        if (NewAlign.has_value())
+          NewAlign = commonAlignment(*NewAlign, Offset);
+        // Avoid increasing the instruction count.
+        if (isa<GetElementPtrInst>(OldSrc) && OldSrc->hasOneUse())
+          NewSrc =
+              Builder.CreateInBoundsPtrAdd(NewSrc, Builder.getInt64(Offset));
+        else
+          NewSrc = nullptr;
+      }
+      // Avoid infinite loops.
+      if (NewSrc && !BatchAA.isMustAlias(OldSrc, NewSrc))
+        AvailableVal = Builder.CreateAlignedLoad(LI.getType(), NewSrc, NewAlign,
+                                                 LI.getName());
+      else {
+        AvailableVal = nullptr;
+        if (NewSrc && NewSrc->use_empty())
+          cast<Instruction>(NewSrc)->eraseFromParent();
+      }
+    } else
+      AvailableVal = Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
+                                                    LI.getName() + ".cast");
+
+    if (AvailableVal)
+      return replaceInstUsesWith(LI, AvailableVal);
   }
 
   // None of the following transforms are legal for volatile/ordered atomic
diff --git a/llvm/test/Transforms/InstCombine/memcpy-forward-load.ll b/llvm/test/Transforms/InstCombine/memcpy-forward-load.ll
new file mode 100644
index 0000000000000..7a56bb50b0903
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/memcpy-forward-load.ll
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i24 @forward_load(ptr align 4 %src) {
+; CHECK-LABEL: define i24 @forward_load(
+; CHECK-SAME: ptr align 4 [[SRC:%.*]]) {
+; CHECK-NEXT:    [[VAL1:%.*]] = load i24, ptr [[SRC]], align 4
+; CHECK-NEXT:    ret i24 [[VAL1]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+  %val = load i24, ptr %dest
+  ret i24 %val
+}
+
+define i8 @forward_load_gep(ptr %src) {
+; CHECK-LABEL: define i8 @forward_load_gep(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    ret i8 [[VAL]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+  %gep = getelementptr inbounds i8, ptr %dest, i64 2
+  %val = load i8, ptr %gep
+  ret i8 %val
+}
+
+define i17 @forward_load_padding(ptr %src) {
+; CHECK-LABEL: define i17 @forward_load_padding(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[VAL:%.*]] = load i17, ptr [[SRC]], align 1
+; CHECK-NEXT:    ret i17 [[VAL]]
+;
+  %dest = alloca [5 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+  %val = load i17, ptr %dest
+  ret i17 %val
+}
+
+define <2 x i8> @forward_load_vector(ptr %src) {
+; CHECK-LABEL: define <2 x i8> @forward_load_vector(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[SRC]], align 1
+; CHECK-NEXT:    ret <2 x i8> [[TMP1]]
+;
+  %dest = alloca <2 x i8>
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false)
+  %val = load <2 x i8>, ptr %dest
+  ret <2 x i8> %val
+}
+
+; Negative tests
+
+define i24 @forward_load_volatile(ptr %src) {
+; CHECK-LABEL: define i24 @forward_load_volatile(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT:    [[VAL:%.*]] = load volatile i24, ptr [[DEST]], align 4
+; CHECK-NEXT:    ret i24 [[VAL]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+  %val = load volatile i24, ptr %dest
+  ret i24 %val
+}
+
+define i24 @failed_forward_load_write_src(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_write_src(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT:    store i1 true, ptr [[SRC]], align 1
+; CHECK-NEXT:    [[VAL:%.*]] = load i24, ptr [[DEST]], align 4
+; CHECK-NEXT:    ret i24 [[VAL]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+  store i1 true, ptr %src
+  %val = load i24, ptr %dest
+  ret i24 %val
+}
+
+define i24 @failed_forward_load_write_dest(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_write_dest(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT:    store i1 true, ptr [[DEST]], align 1
+; CHECK-NEXT:    [[VAL:%.*]] = load i24, ptr [[DEST]], align 4
+; CHECK-NEXT:    ret i24 [[VAL]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+  store i1 true, ptr %dest
+  %val = load i24, ptr %dest
+  ret i24 %val
+}
+
+define i16 @failed_forward_load_size(ptr %src) {
+; CHECK-LABEL: define i16 @failed_forward_load_size(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, ptr [[SRC]], align 1
+; CHECK-NEXT:    store i8 [[TMP1]], ptr [[DEST]], align 1
+; CHECK-NEXT:    [[VAL:%.*]] = load i16, ptr [[DEST]], align 2
+; CHECK-NEXT:    ret i16 [[VAL]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false)
+  %val = load i16, ptr %dest
+  ret i16 %val
+}
+
+define i8 @failed_forward_load_gep(ptr %src) {
+; CHECK-LABEL: define i8 @failed_forward_load_gep(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[SRC]], align 1
+; CHECK-NEXT:    store i16 [[TMP1]], ptr [[DEST]], align 1
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[DEST]], i64 2
+; CHECK-NEXT:    [[VAL:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    ret i8 [[VAL]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false)
+  %gep = getelementptr inbounds i8, ptr %dest, i64 2
+  %val = load i8, ptr %gep
+  ret i8 %val
+}
+
+define i8 @failed_forward_load_gep_multi_use(ptr %src) {
+; CHECK-LABEL: define i8 @failed_forward_load_gep_multi_use(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC]], i64 3, i1 false)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[DEST]], i64 2
+; CHECK-NEXT:    [[VAL1:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT:    call void @use_ptr(ptr nonnull [[GEP]])
+; CHECK-NEXT:    ret i8 [[VAL1]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+  %gep = getelementptr inbounds i8, ptr %dest, i64 2
+  %val = load i8, ptr %gep
+  call void @use_ptr(ptr %gep)
+  ret i8 %val
+}
+
+define i24 @failed_forward_load_must_alias(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_must_alias(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[SRC_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
+; CHECK-NEXT:    [[DEST_GEP:%.*]] = getelementptr inbounds nuw i8, ptr [[SRC]], i64 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[DEST_GEP]], ptr noundef nonnull align 1 dereferenceable(3) [[SRC_GEP]], i64 3, i1 false)
+; CHECK-NEXT:    [[VAL:%.*]] = load i24, ptr [[DEST_GEP]], align 4
+; CHECK-NEXT:    ret i24 [[VAL]]
+;
+  %src_gep = getelementptr inbounds i8, ptr %src, i64 2
+  %dest_gep = getelementptr inbounds i8, ptr %src, i64 2
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest_gep, ptr %src_gep, i64 3, i1 false)
+  %val = load i24, ptr %dest_gep
+  ret i24 %val
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare void @use_ptr(ptr)
diff --git a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
index f084fe38bb226..431870155ae83 100644
--- a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
+++ b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
@@ -208,8 +208,7 @@ define i32 @test_memcpy_after_phi(i1 %cond, ptr %ptr) {
 ; CHECK:       join:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi ptr [ [[A]], [[IF]] ], [ [[PTR:%.*]], [[ENTRY:%.*]] ]
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(32) [[PHI]], ptr noundef nonnull align 16 dereferenceable(32) @g1, i64 32, i1 false)
-; CHECK-NEXT:    [[V:%.*]] = load i32, ptr [[PHI]], align 4
-; CHECK-NEXT:    ret i32 [[V]]
+; CHECK-NEXT:    ret i32 0
 ;
 entry:
   %a = alloca [32 x i8]
@@ -384,8 +383,7 @@ define i8 @select_after_memcpy_keep_alloca(i1 %cond, ptr %p) {
 ; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [32 x i8], align 1
 ; CHECK-NEXT:    [[PTR:%.*]] = select i1 [[COND:%.*]], ptr [[ALLOCA]], ptr [[P:%.*]]
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(32) [[PTR]], ptr noundef nonnull align 16 dereferenceable(32) @g1, i64 32, i1 false)
-; CHECK-NEXT:    [[LOAD:%.*]] = load i8, ptr [[PTR]], align 1
-; CHECK-NEXT:    ret i8 [[LOAD]]
+; CHECK-NEXT:    ret i8 0
 ;
 entry:
   %alloca = alloca [32 x i8]
diff --git a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
new file mode 100644
index 0000000000000..d5dc213e6d6b6
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -O2 -S < %s | FileCheck %s
+
+define i1 @main(ptr %i2) {
+; CHECK-LABEL: define noundef i1 @main(
+; CHECK-SAME: ptr writeonly captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[COMMON_RET:.*:]]
+; CHECK-NEXT:    store i8 0, ptr [[I2]], align 1
+; CHECK-NEXT:    [[I3:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
+; CHECK-NEXT:    store i8 1, ptr [[I3]], align 1
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
+; CHECK-NEXT:    store i8 2, ptr [[I4]], align 1
+; CHECK-NEXT:    ret i1 true
+;
+  %i1 = alloca [3 x i8], align 1
+  store i8 0, ptr %i2, align 1
+  %i3 = getelementptr inbounds nuw i8, ptr %i2, i64 1
+  store i8 1, ptr %i3, align 1
+  %i4 = getelementptr inbounds nuw i8, ptr %i2, i64 2
+  store i8 2, ptr %i4, align 1
+  call void @llvm.lifetime.start.p0(i64 3, ptr nonnull %i1)
+  call void @llvm.memcpy.p0.p0.i64(ptr %i1, ptr %i2, i64 3, i1 false)
+  %i5 = load i8, ptr %i1, align 1
+  %i6 = icmp eq i8 %i5, 0
+  %i7 = getelementptr inbounds nuw i8, ptr %i1, i64 1
+  %i8 = load i8, ptr %i7, align 1
+  %i9 = icmp eq i8 %i8, 1
+  %i10 = select i1 %i6, i1 %i9, i1 false
+  %i11 = getelementptr inbounds nuw i8, ptr %i1, i64 2
+  %i12 = load i8, ptr %i11, align 1
+  %i13 = icmp eq i8 %i12, 2
+  %i14 = select i1 %i10, i1 %i13, i1 false
+  br i1 %i14, label %true, label %false
+
+true:
+  call void @llvm.lifetime.end.p0(i64 3, ptr nonnull %i1)
+  ret i1 true
+
+false:
+  call void @assert_failed(ptr %i1)
+  ret i1 false
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare void @llvm.lifetime.start.p0(i64, ptr)
+declare void @llvm.lifetime.end.p0(i64, ptr)
+declare void @assert_failed(ptr)
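
Note for readers (not part of the patch): the first InstCombine test above,
@forward_load, already demonstrates the new fold end to end; restated here as
a plain before/after sketch using the same IR. Before instcombine:

  define i24 @forward_load(ptr align 4 %src) {
    %dest = alloca [3 x i8]
    call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
    %val = load i24, ptr %dest
    ret i24 %val
  }

After instcombine, the load is forwarded through the memcpy and reads directly
from the source, keeping the source's align 4. Once the load no longer uses
%dest, the memcpy and the alloca become trivially dead and are deleted as
well, matching the CHECK lines of that test:

  define i24 @forward_load(ptr align 4 %src) {
    %val1 = load i24, ptr %src, align 4
    ret i24 %val1
  }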