From fc89492789acf5657e2cdd4663f946e78cf662f5 Mon Sep 17 00:00:00 2001 From: DianQK Date: Tue, 2 Apr 2024 08:46:38 +0800 Subject: [PATCH 01/12] Pre-commit test cases --- .../MemCpyOpt/memcpy-memcpy-offset.ll | 195 ++++++++++++++++++ .../Transforms/PhaseOrdering/memcpy-offset.ll | 42 ++++ 2 files changed, 237 insertions(+) create mode 100644 llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll create mode 100644 llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll new file mode 100644 index 0000000000000..fe5056d85dcd3 --- /dev/null +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll @@ -0,0 +1,195 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=memcpyopt -S -verify-memoryssa | FileCheck %s + +%buf = type [9 x i8] + +; We can forward `memcpy` because the copy location are the same, +define void @forward_offset(ptr %dep_src) { +; CHECK-LABEL: define void @forward_offset( +; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) +; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: ret void +; + %dep_dest = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false) + %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 + %dest = getelementptr inbounds i8, ptr %dep_src, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false) + ret void +} + +; We need to update the align value of the 
source of `memcpy` when forwarding. +define void @forward_offset_align(ptr %dep_src) { +; CHECK-LABEL: define void @forward_offset_align( +; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false) +; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 3 +; CHECK-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 3 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 5, i1 false) +; CHECK-NEXT: ret void +; + %dep_dest = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 4 %dep_src, i64 9, i1 false) + %src = getelementptr inbounds i8, ptr %dep_dest, i64 3 + %dest = getelementptr inbounds i8, ptr %dep_src, i64 3 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false) + ret void +} + +; We can change the align value to 2 when forwarding. 
+define void @forward_offset_align_2(ptr %dep_src) { +; CHECK-LABEL: define void @forward_offset_align_2( +; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false) +; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 2 +; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: ret void +; + %dep_dest = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 4 %dep_src, i64 9, i1 false) + %src = getelementptr inbounds i8, ptr %dep_dest, i64 2 + %dest = getelementptr inbounds i8, ptr %dep_src, i64 2 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false) + ret void +} + +; We need to create a GEP instruction when forwarding. 
+define void @forward_offset_with_gep(ptr %dep_src) { +; CHECK-LABEL: define void @forward_offset_with_gep( +; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) +; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CHECK-NEXT: [[DEP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP1]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: ret void +; + %dep_dest = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false) + %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 + %dest = getelementptr inbounds i8, ptr %dep_src, i64 2 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false) + ret void +} + +; Make sure we pass the right parameters when calling `memcpy`. 
+define void @forward_offset_memcpy(ptr %dep_src) { +; CHECK-LABEL: define void @forward_offset_memcpy( +; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 +; CHECK-NEXT: [[DEST:%.*]] = alloca [9 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) +; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: call void @use(ptr [[DEST]]) +; CHECK-NEXT: ret void +; + %dep_dest = alloca %buf, align 1 + %dest = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false) + %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false) + call void @use(ptr %dest) + ret void +} + +; Make sure we pass the right parameters when calling `memcpy.inline`. 
+define void @forward_offset_memcpy_inline(ptr %dep_src) { +; CHECK-LABEL: define void @forward_offset_memcpy_inline( +; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 +; CHECK-NEXT: [[DEST:%.*]] = alloca [9 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) +; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: call void @use(ptr [[DEST]]) +; CHECK-NEXT: ret void +; + %dep_dest = alloca %buf, align 1 + %dest = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false) + %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 + call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false) + call void @use(ptr %dest) + ret void +} + +; We cannot forward `memcpy` because it exceeds the size of `memcpy` it depends on. 
+define void @do_not_forward_oversize_offset(ptr %dep_src) { +; CHECK-LABEL: define void @do_not_forward_oversize_offset( +; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 6, i1 false) +; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: ret void +; + %dep_dest = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 6, i1 false) + %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 + %dest = getelementptr inbounds i8, ptr %dep_src, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false) + ret void +} + +; We can forward `memcpy` because the write operation does not corrupt the location to be copied. 
+define void @forward_offset_and_store(ptr %dep_src) { +; CHECK-LABEL: define void @forward_offset_and_store( +; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) +; CHECK-NEXT: store i8 1, ptr [[DEP_SRC]], align 1 +; CHECK-NEXT: [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6 +; CHECK-NEXT: store i8 1, ptr [[DEP_SRC_END]], align 1 +; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 5, i1 false) +; CHECK-NEXT: ret void +; + %dep_dest = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false) + store i8 1, ptr %dep_src, align 1 + %dep_src_end = getelementptr inbounds i8, ptr %dep_src, i64 6 + store i8 1, ptr %dep_src_end, align 1 + %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 + %dest = getelementptr inbounds i8, ptr %dep_src, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false) + ret void +} + +; We cannot forward `memcpy` because the write operation alters the location to be copied. +; Also, make sure we have removed the GEP instruction that was created temporarily. 
+define void @do_not_forward_offset_and_store(ptr %dep_src) { +; CHECK-LABEL: define void @do_not_forward_offset_and_store( +; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) +; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 +; CHECK-NEXT: store i8 1, ptr [[DEP]], align 1 +; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CHECK-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 5, i1 false) +; CHECK-NEXT: ret void +; + %dep_dest = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false) + %dep_src_offset = getelementptr inbounds i8, ptr %dep_src, i64 1 + store i8 1, ptr %dep_src_offset, align 1 + %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 + %dest = getelementptr inbounds i8, ptr %dep_src, i64 2 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false) + ret void +} + +declare void @use(ptr) + +declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) +declare void @llvm.memcpy.inline.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) diff --git a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll new file mode 100644 index 0000000000000..0d34932937eee --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=memcpyopt,dse,instcombine -S -verify-memoryssa | FileCheck --check-prefix=CUSTOM %s +; RUN: opt < %s -O2 -S | FileCheck --check-prefix=O2 %s + +%buf = type [7 x i8] + +; Check that we eliminate all `memcpy` 
calls in this function. +define void @forward_offset_and_store(ptr %dep_src) { +; CUSTOM-LABEL: define void @forward_offset_and_store( +; CUSTOM-SAME: ptr [[DEP_SRC:%.*]]) { +; CUSTOM-NEXT: [[DEP_DEST:%.*]] = alloca [7 x i8], align 1 +; CUSTOM-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(7) [[DEP_DEST]], ptr noundef nonnull align 1 dereferenceable(7) [[DEP_SRC]], i64 7, i1 false) +; CUSTOM-NEXT: store i8 1, ptr [[DEP_SRC]], align 1 +; CUSTOM-NEXT: [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6 +; CUSTOM-NEXT: store i8 1, ptr [[DEP_SRC_END]], align 1 +; CUSTOM-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CUSTOM-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 +; CUSTOM-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(5) [[DEST]], ptr noundef nonnull align 1 dereferenceable(5) [[SRC]], i64 5, i1 false) +; CUSTOM-NEXT: ret void +; +; O2-LABEL: define void @forward_offset_and_store( +; O2-SAME: ptr nocapture [[DEP_SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; O2-NEXT: [[DEP_DEST:%.*]] = alloca [7 x i8], align 1 +; O2-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(7) [[DEP_DEST]], ptr noundef nonnull align 1 dereferenceable(7) [[DEP_SRC]], i64 7, i1 false) +; O2-NEXT: store i8 1, ptr [[DEP_SRC]], align 1 +; O2-NEXT: [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6 +; O2-NEXT: store i8 1, ptr [[DEP_SRC_END]], align 1 +; O2-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; O2-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 +; O2-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(5) [[DEST]], ptr noundef nonnull align 1 dereferenceable(5) [[SRC]], i64 5, i1 false) +; O2-NEXT: ret void +; + %dep_dest = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr 
align 1 %dep_src, i64 7, i1 false) + store i8 1, ptr %dep_src, align 1 + %dep_src_end = getelementptr inbounds i8, ptr %dep_src, i64 6 + store i8 1, ptr %dep_src_end, align 1 + %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 + %dest = getelementptr inbounds i8, ptr %dep_src, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false) + ret void +} From d5a966c9ecb484ea1777b7417bc151046318fe79 Mon Sep 17 00:00:00 2001 From: DianQK Date: Tue, 2 Apr 2024 08:47:44 +0800 Subject: [PATCH 02/12] [MemCpyOpt] Calculate the offset value to forward `memcpy` --- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 85 +++++++++++++------ .../MemCpyOpt/memcpy-memcpy-offset.ll | 17 ++-- .../Transforms/PhaseOrdering/memcpy-offset.ll | 12 +-- 3 files changed, 72 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 9bf87f2370531..e41c86069fb4e 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator_range.h" @@ -1124,28 +1125,67 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad, bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, BatchAAResults &BAA) { - // We can only transforms memcpy's where the dest of one is the source of the - // other. - if (M->getSource() != MDep->getDest() || MDep->isVolatile()) - return false; - // If dep instruction is reading from our current input, then it is a noop - // transfer and substituting the input won't change this instruction. Just - // ignore the input and let someone else zap MDep. 
This handles cases like: + // transfer and substituting the input won't change this instruction. Just + // ignore the input and let someone else zap MDep. This handles cases like: // memcpy(a <- a) // memcpy(b <- a) if (M->getSource() == MDep->getSource()) return false; - // Second, the length of the memcpy's must be the same, or the preceding one + // We can only optimize non-volatile memcpy's. + if (MDep->isVolatile()) + return false; + + int64_t MForwardOffset = 0; + const DataLayout &DL = M->getModule()->getDataLayout(); + // We can only transforms memcpy's where the dest of one is the source of the + // other, or they have an offset in a range. + if (M->getSource() != MDep->getDest()) { + std::optional Offset = + M->getSource()->getPointerOffsetFrom(MDep->getDest(), DL); + if (!Offset || *Offset < 0) + return false; + MForwardOffset = *Offset; + } + + // The length of the memcpy's must be the same, or the preceding one // must be larger than the following one. - if (MDep->getLength() != M->getLength()) { + if (MForwardOffset != 0 || (MDep->getLength() != M->getLength())) { auto *MDepLen = dyn_cast(MDep->getLength()); auto *MLen = dyn_cast(M->getLength()); - if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) + if (!MDepLen || !MLen || + MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset) return false; } + IRBuilder<> Builder(M); + auto *CopySource = MDep->getRawSource(); + auto CleanupOnFailure = llvm::make_scope_exit([&CopySource] { + if (CopySource->use_empty()) + cast(CopySource)->eraseFromParent(); + }); + MaybeAlign CopySourceAlign = MDep->getSourceAlign(); + // We just need to calculate the actual size of the copy. + auto MCopyLoc = MemoryLocation::getForSource(MDep).getWithNewSize( + MemoryLocation::getForSource(M).Size); + + // We need to update `MCopyLoc` if an offset exists. + if (MForwardOffset > 0) { + // The copy destination of `M` maybe can serve as the source of copying. 
+ std::optional MDestOffset = + M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL); + if (MDestOffset && *MDestOffset == MForwardOffset) + CopySource = M->getRawDest(); + else + CopySource = Builder.CreateInBoundsPtrAdd( + CopySource, ConstantInt::get(Type::getInt64Ty(Builder.getContext()), + MForwardOffset)); + MCopyLoc = MCopyLoc.getWithNewPtr(CopySource); + if (CopySourceAlign) + CopySourceAlign = commonAlignment(*CopySourceAlign, MForwardOffset); + } + // Verify that the copied-from memory doesn't change in between the two // transfers. For example, in: // memcpy(a <- b) @@ -1155,10 +1195,8 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // // TODO: If the code between M and MDep is transparent to the destination "c", // then we could still perform the xform by moving M up to the first memcpy. - // TODO: It would be sufficient to check the MDep source up to the memcpy - // size of M, rather than MDep. - if (writtenBetween(MSSA, BAA, MemoryLocation::getForSource(MDep), - MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(M))) + if (writtenBetween(MSSA, BAA, MCopyLoc, MSSA->getMemoryAccess(MDep), + MSSA->getMemoryAccess(M))) return false; // No need to create `memcpy(a <- a)`. @@ -1191,23 +1229,22 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // TODO: Is this worth it if we're creating a less aligned memcpy? For // example we could be moving from movaps -> movq on x86. - IRBuilder<> Builder(M); Instruction *NewM; if (UseMemMove) - NewM = Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(), - MDep->getRawSource(), MDep->getSourceAlign(), - M->getLength(), M->isVolatile()); + NewM = + Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(), CopySource, + CopySourceAlign, M->getLength(), M->isVolatile()); else if (isa(M)) { // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is // never allowed since that would allow the latter to be lowered as a call // to an external function. 
- NewM = Builder.CreateMemCpyInline( - M->getRawDest(), M->getDestAlign(), MDep->getRawSource(), - MDep->getSourceAlign(), M->getLength(), M->isVolatile()); + NewM = Builder.CreateMemCpyInline(M->getRawDest(), M->getDestAlign(), + CopySource, CopySourceAlign, + M->getLength(), M->isVolatile()); } else - NewM = Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(), - MDep->getRawSource(), MDep->getSourceAlign(), - M->getLength(), M->isVolatile()); + NewM = + Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(), CopySource, + CopySourceAlign, M->getLength(), M->isVolatile()); NewM->copyMetadata(*M, LLVMContext::MD_DIAssignID); assert(isa(MSSAU->getMemorySSA()->getMemoryAccess(M))); diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll index fe5056d85dcd3..4d00ea70a564d 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll @@ -11,7 +11,7 @@ define void @forward_offset(ptr %dep_src) { ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) ; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 ; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[DEP]], i64 6, i1 false) ; CHECK-NEXT: ret void ; %dep_dest = alloca %buf, align 1 @@ -30,7 +30,7 @@ define void @forward_offset_align(ptr %dep_src) { ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false) ; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 3 ; CHECK-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 3 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 
[[DEST]], ptr align 1 [[SRC]], i64 5, i1 false) +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[DEST]], i64 5, i1 false) ; CHECK-NEXT: ret void ; %dep_dest = alloca %buf, align 1 @@ -49,7 +49,7 @@ define void @forward_offset_align_2(ptr %dep_src) { ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false) ; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 2 ; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 2 [[DEP]], i64 6, i1 false) ; CHECK-NEXT: ret void ; %dep_dest = alloca %buf, align 1 @@ -68,7 +68,8 @@ define void @forward_offset_with_gep(ptr %dep_src) { ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) ; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 ; CHECK-NEXT: [[DEP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP1]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP1]], ptr align 1 [[TMP1]], i64 6, i1 false) ; CHECK-NEXT: ret void ; %dep_dest = alloca %buf, align 1 @@ -87,7 +88,8 @@ define void @forward_offset_memcpy(ptr %dep_src) { ; CHECK-NEXT: [[DEST:%.*]] = alloca [9 x i8], align 1 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) ; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr 
inbounds i8, ptr [[DEP_SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false) ; CHECK-NEXT: call void @use(ptr [[DEST]]) ; CHECK-NEXT: ret void ; @@ -108,7 +110,8 @@ define void @forward_offset_memcpy_inline(ptr %dep_src) { ; CHECK-NEXT: [[DEST:%.*]] = alloca [9 x i8], align 1 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) ; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 -; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false) ; CHECK-NEXT: call void @use(ptr [[DEST]]) ; CHECK-NEXT: ret void ; @@ -151,7 +154,7 @@ define void @forward_offset_and_store(ptr %dep_src) { ; CHECK-NEXT: store i8 1, ptr [[DEP_SRC_END]], align 1 ; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 ; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 5, i1 false) +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[DEP]], i64 5, i1 false) ; CHECK-NEXT: ret void ; %dep_dest = alloca %buf, align 1 diff --git a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll index 0d34932937eee..c7c05901455bc 100644 --- a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll +++ b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll @@ -8,26 +8,16 @@ define void @forward_offset_and_store(ptr %dep_src) { ; CUSTOM-LABEL: define void @forward_offset_and_store( ; CUSTOM-SAME: ptr [[DEP_SRC:%.*]]) { -; CUSTOM-NEXT: [[DEP_DEST:%.*]] = alloca [7 x i8], align 1 -; CUSTOM-NEXT: call 
void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(7) [[DEP_DEST]], ptr noundef nonnull align 1 dereferenceable(7) [[DEP_SRC]], i64 7, i1 false) ; CUSTOM-NEXT: store i8 1, ptr [[DEP_SRC]], align 1 ; CUSTOM-NEXT: [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6 ; CUSTOM-NEXT: store i8 1, ptr [[DEP_SRC_END]], align 1 -; CUSTOM-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 -; CUSTOM-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 -; CUSTOM-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(5) [[DEST]], ptr noundef nonnull align 1 dereferenceable(5) [[SRC]], i64 5, i1 false) ; CUSTOM-NEXT: ret void ; ; O2-LABEL: define void @forward_offset_and_store( -; O2-SAME: ptr nocapture [[DEP_SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; O2-NEXT: [[DEP_DEST:%.*]] = alloca [7 x i8], align 1 -; O2-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(7) [[DEP_DEST]], ptr noundef nonnull align 1 dereferenceable(7) [[DEP_SRC]], i64 7, i1 false) +; O2-SAME: ptr nocapture writeonly [[DEP_SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; O2-NEXT: store i8 1, ptr [[DEP_SRC]], align 1 ; O2-NEXT: [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6 ; O2-NEXT: store i8 1, ptr [[DEP_SRC_END]], align 1 -; O2-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 -; O2-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 -; O2-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(5) [[DEST]], ptr noundef nonnull align 1 dereferenceable(5) [[SRC]], i64 5, i1 false) ; O2-NEXT: ret void ; %dep_dest = alloca %buf, align 1 From 289b44d1b13c84136ddd55065b8b8333880567ba Mon Sep 17 00:00:00 2001 From: DianQK Date: Tue, 9 Jul 2024 20:40:30 +0800 Subject: [PATCH 03/12] Fix nits --- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 7 +++---- 1 file changed, 3 
insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index e41c86069fb4e..ae11ca057d47a 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1151,7 +1151,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // The length of the memcpy's must be the same, or the preceding one // must be larger than the following one. - if (MForwardOffset != 0 || (MDep->getLength() != M->getLength())) { + if (MForwardOffset != 0 || MDep->getLength() != M->getLength()) { auto *MDepLen = dyn_cast(MDep->getLength()); auto *MLen = dyn_cast(M->getLength()); if (!MDepLen || !MLen || @@ -1175,12 +1175,11 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, // The copy destination of `M` maybe can serve as the source of copying. std::optional MDestOffset = M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL); - if (MDestOffset && *MDestOffset == MForwardOffset) + if (MDestOffset == MForwardOffset) CopySource = M->getRawDest(); else CopySource = Builder.CreateInBoundsPtrAdd( - CopySource, ConstantInt::get(Type::getInt64Ty(Builder.getContext()), - MForwardOffset)); + CopySource, Builder.getInt64(MForwardOffset)); MCopyLoc = MCopyLoc.getWithNewPtr(CopySource); if (CopySourceAlign) CopySourceAlign = commonAlignment(*CopySourceAlign, MForwardOffset); From d238bb8853f1d28c4a9617a249d7ee2652e15f2d Mon Sep 17 00:00:00 2001 From: DianQK Date: Tue, 9 Jul 2024 21:33:18 +0800 Subject: [PATCH 04/12] Add comments for the forward offset --- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index ae11ca057d47a..3bbade11c2ec2 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1170,7 +1170,11 
@@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, auto MCopyLoc = MemoryLocation::getForSource(MDep).getWithNewSize( MemoryLocation::getForSource(M).Size); - // We need to update `MCopyLoc` if an offset exists. + // When the forwarding offset is greater than 0, we transform + // memcpy(d1 <- s1) + // memcpy(d2 <- d1+o) + // to + // memcpy(d2 <- s1+o) if (MForwardOffset > 0) { // The copy destination of `M` maybe can serve as the source of copying. std::optional MDestOffset = @@ -1180,6 +1184,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, else CopySource = Builder.CreateInBoundsPtrAdd( CopySource, Builder.getInt64(MForwardOffset)); + // We need to update `MCopyLoc` if an offset exists. MCopyLoc = MCopyLoc.getWithNewPtr(CopySource); if (CopySourceAlign) CopySourceAlign = commonAlignment(*CopySourceAlign, MForwardOffset); From a34be7372e6bbb542f608056a0c616f4982cdc41 Mon Sep 17 00:00:00 2001 From: DianQK Date: Tue, 9 Jul 2024 21:53:01 +0800 Subject: [PATCH 05/12] Update memcpy-memcpy-offset.ll --- .../MemCpyOpt/memcpy-memcpy-offset.ll | 224 +++++++++--------- 1 file changed, 115 insertions(+), 109 deletions(-) diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll index 4d00ea70a564d..447c55fd9b690 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll @@ -4,191 +4,197 @@ %buf = type [9 x i8] ; We can forward `memcpy` because the copy location are the same, -define void @forward_offset(ptr %dep_src) { +define void @forward_offset(ptr %src, ptr %dest) { ; CHECK-LABEL: define void @forward_offset( -; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) { ; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) -; CHECK-NEXT: [[SRC:%.*]] = 
getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 -; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 -; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[DEP]], i64 6, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 7, i1 false) +; CHECK-NEXT: [[SRC_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false) ; CHECK-NEXT: ret void ; - %dep_dest = alloca %buf, align 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false) - %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 - %dest = getelementptr inbounds i8, ptr %dep_src, i64 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false) + %cpy_tmp = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 7, i1 false) + %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false) ret void } ; We need to update the align value of the source of `memcpy` when forwarding. 
-define void @forward_offset_align(ptr %dep_src) { +define void @forward_offset_align(ptr %src, ptr %dest) { ; CHECK-LABEL: define void @forward_offset_align( -; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) { ; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false) -; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 3 -; CHECK-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 3 -; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[DEST]], i64 5, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[SRC]], i64 9, i1 false) +; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 3 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 5, i1 false) ; CHECK-NEXT: ret void ; - %dep_dest = alloca %buf, align 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 4 %dep_src, i64 9, i1 false) - %src = getelementptr inbounds i8, ptr %dep_dest, i64 3 - %dest = getelementptr inbounds i8, ptr %dep_src, i64 3 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false) + %cpy_tmp = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 4 %src, i64 9, i1 false) + %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 3 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 5, i1 false) ret void } ; We can change the align value to 2 when forwarding. 
-define void @forward_offset_align_2(ptr %dep_src) { +define void @forward_offset_align_2(ptr %src, ptr %dest) { ; CHECK-LABEL: define void @forward_offset_align_2( -; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) { ; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[DEP_SRC]], i64 9, i1 false) -; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 2 -; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2 -; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 2 [[DEP]], i64 6, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 4 [[SRC]], i64 9, i1 false) +; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 2 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 2 [[TMP1]], i64 6, i1 false) ; CHECK-NEXT: ret void ; - %dep_dest = alloca %buf, align 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 4 %dep_src, i64 9, i1 false) - %src = getelementptr inbounds i8, ptr %dep_dest, i64 2 - %dest = getelementptr inbounds i8, ptr %dep_src, i64 2 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false) + %cpy_tmp = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 4 %src, i64 9, i1 false) + %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 2 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false) + ret void +} + +; If the copy destination can be used as the copy source, we don't need to create a GEP instruction. 
+define void @forward_offset_without_gep(ptr %src) { +; CHECK-LABEL: define void @forward_offset_without_gep( +; CHECK-SAME: ptr [[SRC:%.*]]) { +; CHECK-NEXT: [[TMP:%.*]] = alloca [9 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP]], ptr align 1 [[SRC]], i64 7, i1 false) +; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[TMP1]], ptr align 1 [[TMP1]], i64 6, i1 false) +; CHECK-NEXT: ret void +; + %cpy_tmp = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 7, i1 false) + %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1 + %dest = getelementptr inbounds i8, ptr %src, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false) ret void } ; We need to create a GEP instruction when forwarding. 
-define void @forward_offset_with_gep(ptr %dep_src) { +define void @forward_offset_with_gep(ptr %src) { ; CHECK-LABEL: define void @forward_offset_with_gep( -; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-SAME: ptr [[SRC:%.*]]) { ; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) -; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 -; CHECK-NEXT: [[DEP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 -; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP1]], ptr align 1 [[TMP1]], i64 6, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 7, i1 false) +; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CHECK-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false) ; CHECK-NEXT: ret void ; - %dep_dest = alloca %buf, align 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false) - %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 - %dest = getelementptr inbounds i8, ptr %dep_src, i64 2 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false) + %cpy_tmp = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 7, i1 false) + %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1 + %dest = getelementptr inbounds i8, ptr %src, i64 2 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false) ret void } ; Make sure we pass the right parameters when calling `memcpy`. 
-define void @forward_offset_memcpy(ptr %dep_src) { +define void @forward_offset_memcpy(ptr %src, ptr %dest) { ; CHECK-LABEL: define void @forward_offset_memcpy( -; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) { ; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 -; CHECK-NEXT: [[DEST:%.*]] = alloca [9 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) -; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 7, i1 false) +; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false) ; CHECK-NEXT: call void @use(ptr [[DEST]]) ; CHECK-NEXT: ret void ; - %dep_dest = alloca %buf, align 1 - %dest = alloca %buf, align 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false) - %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false) + %cpy_tmp = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 7, i1 false) + %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false) call void @use(ptr %dest) ret void } ; Make sure we pass the right parameters when calling `memcpy.inline`. 
-define void @forward_offset_memcpy_inline(ptr %dep_src) { +define void @forward_offset_memcpy_inline(ptr %src, ptr %dest) { ; CHECK-LABEL: define void @forward_offset_memcpy_inline( -; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) { ; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 -; CHECK-NEXT: [[DEST:%.*]] = alloca [9 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) -; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 7, i1 false) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 ; CHECK-NEXT: call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 6, i1 false) ; CHECK-NEXT: call void @use(ptr [[DEST]]) ; CHECK-NEXT: ret void ; - %dep_dest = alloca %buf, align 1 - %dest = alloca %buf, align 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false) - %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 - call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false) + %cpy_tmp = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 7, i1 false) + %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1 + call void @llvm.memcpy.inline.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false) call void @use(ptr %dest) ret void } ; We cannot forward `memcpy` because it exceeds the size of `memcpy` it depends on. 
-define void @do_not_forward_oversize_offset(ptr %dep_src) { +define void @do_not_forward_oversize_offset(ptr %src, ptr %dest) { ; CHECK-LABEL: define void @do_not_forward_oversize_offset( -; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) { ; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 6, i1 false) -; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 -; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 6, i1 false) +; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP_OFFSET]], i64 6, i1 false) ; CHECK-NEXT: ret void ; - %dep_dest = alloca %buf, align 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 6, i1 false) - %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 - %dest = getelementptr inbounds i8, ptr %dep_src, i64 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 6, i1 false) + %cpy_tmp = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 6, i1 false) + %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 6, i1 false) ret void } ; We can forward `memcpy` because the write operation does not corrupt the location to be copied. 
-define void @forward_offset_and_store(ptr %dep_src) { +define void @forward_offset_and_store(ptr %src, ptr %dest) { ; CHECK-LABEL: define void @forward_offset_and_store( -; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) { ; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) -; CHECK-NEXT: store i8 1, ptr [[DEP_SRC]], align 1 -; CHECK-NEXT: [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 7, i1 false) +; CHECK-NEXT: store i8 1, ptr [[SRC]], align 1 +; CHECK-NEXT: [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 6 ; CHECK-NEXT: store i8 1, ptr [[DEP_SRC_END]], align 1 -; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 -; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 -; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEP]], ptr align 1 [[DEP]], i64 5, i1 false) +; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP1]], i64 5, i1 false) ; CHECK-NEXT: ret void ; - %dep_dest = alloca %buf, align 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false) - store i8 1, ptr %dep_src, align 1 - %dep_src_end = getelementptr inbounds i8, ptr %dep_src, i64 6 - store i8 1, ptr %dep_src_end, align 1 - %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 - %dest = getelementptr inbounds i8, ptr %dep_src, i64 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false) + %cpy_tmp = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 
%cpy_tmp, ptr align 1 %src, i64 7, i1 false) + store i8 1, ptr %src, align 1 + %src_end = getelementptr inbounds i8, ptr %src, i64 6 + store i8 1, ptr %src_end, align 1 + %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 5, i1 false) ret void } ; We cannot forward `memcpy` because the write operation alters the location to be copied. ; Also, make sure we have removed the GEP instruction that was created temporarily. -define void @do_not_forward_offset_and_store(ptr %dep_src) { +define void @do_not_forward_offset_and_store(ptr %src, ptr %dest) { ; CHECK-LABEL: define void @do_not_forward_offset_and_store( -; CHECK-SAME: ptr [[DEP_SRC:%.*]]) { +; CHECK-SAME: ptr [[SRC:%.*]], ptr [[DEST:%.*]]) { ; CHECK-NEXT: [[DEP_DEST:%.*]] = alloca [9 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[DEP_SRC]], i64 7, i1 false) -; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEP_DEST]], ptr align 1 [[SRC]], i64 7, i1 false) +; CHECK-NEXT: [[DEP:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 ; CHECK-NEXT: store i8 1, ptr [[DEP]], align 1 -; CHECK-NEXT: [[SRC:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 -; CHECK-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 2 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[SRC]], i64 5, i1 false) +; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[DEP_DEST]], i64 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[DEST]], ptr align 1 [[TMP_OFFSET]], i64 5, i1 false) ; CHECK-NEXT: ret void ; - %dep_dest = alloca %buf, align 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false) - %dep_src_offset = getelementptr inbounds i8, ptr %dep_src, i64 1 - store i8 1, 
ptr %dep_src_offset, align 1 - %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 - %dest = getelementptr inbounds i8, ptr %dep_src, i64 2 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false) + %cpy_tmp = alloca %buf, align 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %cpy_tmp, ptr align 1 %src, i64 7, i1 false) + %src_offset = getelementptr inbounds i8, ptr %src, i64 1 + store i8 1, ptr %src_offset, align 1 + %cpy_tmp_offset = getelementptr inbounds i8, ptr %cpy_tmp, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %cpy_tmp_offset, i64 5, i1 false) ret void } From 1da22cd2fb1f5f5b4917ff36b934ea4d6d01d342 Mon Sep 17 00:00:00 2001 From: DianQK Date: Tue, 9 Jul 2024 22:45:58 +0800 Subject: [PATCH 06/12] Leave some FIXME comments --- llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll | 1 + llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll | 1 + 2 files changed, 2 insertions(+) diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll index 447c55fd9b690..82086eed54332 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll @@ -58,6 +58,7 @@ define void @forward_offset_align_2(ptr %src, ptr %dest) { } ; If the copy destination can be used as the copy source, we don't need to create a GEP instruction. +; FIXME: We can directly remove memcpy here. define void @forward_offset_without_gep(ptr %src) { ; CHECK-LABEL: define void @forward_offset_without_gep( ; CHECK-SAME: ptr [[SRC:%.*]]) { diff --git a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll index c7c05901455bc..a81bb91f7ede0 100644 --- a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll +++ b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll @@ -4,6 +4,7 @@ %buf = type [7 x i8] +; FIXME: This can be done independently in memcpyopt. 
; Check that we eliminate all `memcpy` calls in this function. define void @forward_offset_and_store(ptr %dep_src) { ; CUSTOM-LABEL: define void @forward_offset_and_store( From 143c92c732baf838a4bb2291cda6c6d65eec81ce Mon Sep 17 00:00:00 2001 From: DianQK Date: Thu, 11 Jul 2024 21:05:25 +0800 Subject: [PATCH 07/12] Update due to rebase --- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 12 ++++++------ .../Transforms/MemCpyOpt/memcpy-memcpy-offset.ll | 3 --- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 3bbade11c2ec2..ab01ef50c5e91 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1160,7 +1160,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, } IRBuilder<> Builder(M); - auto *CopySource = MDep->getRawSource(); + auto *CopySource = MDep->getSource(); auto CleanupOnFailure = llvm::make_scope_exit([&CopySource] { if (CopySource->use_empty()) cast(CopySource)->eraseFromParent(); @@ -1180,7 +1180,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, std::optional MDestOffset = M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL); if (MDestOffset == MForwardOffset) - CopySource = M->getRawDest(); + CopySource = M->getDest(); else CopySource = Builder.CreateInBoundsPtrAdd( CopySource, Builder.getInt64(MForwardOffset)); @@ -1204,7 +1204,7 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, return false; // No need to create `memcpy(a <- a)`. - if (BAA.isMustAlias(M->getDest(), MDep->getSource())) { + if (BAA.isMustAlias(M->getDest(), CopySource)) { // Remove the instruction we're replacing. 
eraseInstruction(M); ++NumMemCpyInstr; @@ -1236,18 +1236,18 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, Instruction *NewM; if (UseMemMove) NewM = - Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(), CopySource, + Builder.CreateMemMove(M->getDest(), M->getDestAlign(), CopySource, CopySourceAlign, M->getLength(), M->isVolatile()); else if (isa(M)) { // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is // never allowed since that would allow the latter to be lowered as a call // to an external function. - NewM = Builder.CreateMemCpyInline(M->getRawDest(), M->getDestAlign(), + NewM = Builder.CreateMemCpyInline(M->getDest(), M->getDestAlign(), CopySource, CopySourceAlign, M->getLength(), M->isVolatile()); } else NewM = - Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(), CopySource, + Builder.CreateMemCpy(M->getDest(), M->getDestAlign(), CopySource, CopySourceAlign, M->getLength(), M->isVolatile()); NewM->copyMetadata(*M, LLVMContext::MD_DIAssignID); diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll index 82086eed54332..07fc6880746ed 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll @@ -58,15 +58,12 @@ define void @forward_offset_align_2(ptr %src, ptr %dest) { } ; If the copy destination can be used as the copy source, we don't need to create a GEP instruction. -; FIXME: We can directly remove memcpy here. 
define void @forward_offset_without_gep(ptr %src) { ; CHECK-LABEL: define void @forward_offset_without_gep( ; CHECK-SAME: ptr [[SRC:%.*]]) { ; CHECK-NEXT: [[TMP:%.*]] = alloca [9 x i8], align 1 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP]], ptr align 1 [[SRC]], i64 7, i1 false) ; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 -; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 1 [[TMP1]], ptr align 1 [[TMP1]], i64 6, i1 false) ; CHECK-NEXT: ret void ; %cpy_tmp = alloca %buf, align 1 From 351eb0a035cfaa44f2f5b2dc955ae38e12bc3797 Mon Sep 17 00:00:00 2001 From: DianQK Date: Thu, 11 Jul 2024 21:51:47 +0800 Subject: [PATCH 08/12] Update memcpy-offset.ll & Disable 98321 --- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 12 ++--- .../Transforms/PhaseOrdering/memcpy-offset.ll | 47 ++++++++++--------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index ab01ef50c5e91..cacc5f1b595e3 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1204,12 +1204,12 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, return false; // No need to create `memcpy(a <- a)`. - if (BAA.isMustAlias(M->getDest(), CopySource)) { - // Remove the instruction we're replacing. - eraseInstruction(M); - ++NumMemCpyInstr; - return true; - } + // if (BAA.isMustAlias(M->getDest(), CopySource)) { + // // Remove the instruction we're replacing. + // eraseInstruction(M); + // ++NumMemCpyInstr; + // return true; + // } // If the dest of the second might alias the source of the first, then the // source and dest might overlap. 
In addition, if the source of the first diff --git a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll index a81bb91f7ede0..39d4b389891fd 100644 --- a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll +++ b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll @@ -1,33 +1,34 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt < %s -passes=memcpyopt,dse,instcombine -S -verify-memoryssa | FileCheck --check-prefix=CUSTOM %s +; RUN: opt < %s -passes=memcpyopt,instcombine -S -verify-memoryssa | FileCheck --check-prefix=CUSTOM %s ; RUN: opt < %s -O2 -S | FileCheck --check-prefix=O2 %s -%buf = type [7 x i8] - -; FIXME: This can be done independently in memcpyopt. ; Check that we eliminate all `memcpy` calls in this function. -define void @forward_offset_and_store(ptr %dep_src) { -; CUSTOM-LABEL: define void @forward_offset_and_store( -; CUSTOM-SAME: ptr [[DEP_SRC:%.*]]) { -; CUSTOM-NEXT: store i8 1, ptr [[DEP_SRC]], align 1 -; CUSTOM-NEXT: [[DEP_SRC_END:%.*]] = getelementptr inbounds i8, ptr [[DEP_SRC]], i64 6 -; CUSTOM-NEXT: store i8 1, ptr [[DEP_SRC_END]], align 1 +define void @memcpy_forward_back_with_offset(ptr %arg) { +; CUSTOM-LABEL: define void @memcpy_forward_back_with_offset( +; CUSTOM-SAME: ptr [[ARG:%.*]]) { +; CUSTOM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1 +; CUSTOM-NEXT: store i8 1, ptr [[ARG]], align 1 +; CUSTOM-NEXT: [[I3:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1 +; CUSTOM-NEXT: call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(753) [[I3]], ptr noundef nonnull align 1 dereferenceable(753) [[TMP1]], i64 753, i1 false) ; CUSTOM-NEXT: ret void ; -; O2-LABEL: define void @forward_offset_and_store( -; O2-SAME: ptr nocapture writeonly [[DEP_SRC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; O2-NEXT: store i8 1, ptr [[DEP_SRC]], align 1 -; O2-NEXT: [[DEP_SRC_END:%.*]] = getelementptr 
inbounds i8, ptr [[DEP_SRC]], i64 6 -; O2-NEXT: store i8 1, ptr [[DEP_SRC_END]], align 1 +; O2-LABEL: define void @memcpy_forward_back_with_offset( +; O2-SAME: ptr nocapture [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1 +; O2-NEXT: store i8 1, ptr [[ARG]], align 1 +; O2-NEXT: [[I3:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1 +; O2-NEXT: tail call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(753) [[I3]], ptr noundef nonnull align 1 dereferenceable(753) [[TMP1]], i64 753, i1 false) ; O2-NEXT: ret void ; - %dep_dest = alloca %buf, align 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dep_dest, ptr align 1 %dep_src, i64 7, i1 false) - store i8 1, ptr %dep_src, align 1 - %dep_src_end = getelementptr inbounds i8, ptr %dep_src, i64 6 - store i8 1, ptr %dep_src_end, align 1 - %src = getelementptr inbounds i8, ptr %dep_dest, i64 1 - %dest = getelementptr inbounds i8, ptr %dep_src, i64 1 - call void @llvm.memcpy.p0.p0.i64(ptr align 1 %dest, ptr align 1 %src, i64 5, i1 false) + %i = alloca [753 x i8], align 1 + %i1 = alloca [754 x i8], align 1 + call void @llvm.memcpy.p0.p0.i64(ptr %i1, ptr %arg, i64 754, i1 false) + %i2 = getelementptr inbounds i8, ptr %i1, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr %i, ptr %i2, i64 753, i1 false) + store i8 1, ptr %arg, align 1 + %i3 = getelementptr inbounds i8, ptr %arg, i64 1 + call void @llvm.memcpy.p0.p0.i64(ptr %i3, ptr %i, i64 753, i1 false) ret void } + +declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1) From c197dd108b7f2feb6ae5efef43b08659fd30993b Mon Sep 17 00:00:00 2001 From: DianQK Date: Thu, 11 Jul 2024 21:52:27 +0800 Subject: [PATCH 09/12] Re-enable 98321 --- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 12 ++++++------ llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll | 8 +------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp 
b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index cacc5f1b595e3..ab01ef50c5e91 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1204,12 +1204,12 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, return false; // No need to create `memcpy(a <- a)`. - // if (BAA.isMustAlias(M->getDest(), CopySource)) { - // // Remove the instruction we're replacing. - // eraseInstruction(M); - // ++NumMemCpyInstr; - // return true; - // } + if (BAA.isMustAlias(M->getDest(), CopySource)) { + // Remove the instruction we're replacing. + eraseInstruction(M); + ++NumMemCpyInstr; + return true; + } // If the dest of the second might alias the source of the first, then the // source and dest might overlap. In addition, if the source of the first diff --git a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll index 39d4b389891fd..bd910b82496fd 100644 --- a/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll +++ b/llvm/test/Transforms/PhaseOrdering/memcpy-offset.ll @@ -6,18 +6,12 @@ define void @memcpy_forward_back_with_offset(ptr %arg) { ; CUSTOM-LABEL: define void @memcpy_forward_back_with_offset( ; CUSTOM-SAME: ptr [[ARG:%.*]]) { -; CUSTOM-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1 ; CUSTOM-NEXT: store i8 1, ptr [[ARG]], align 1 -; CUSTOM-NEXT: [[I3:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1 -; CUSTOM-NEXT: call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(753) [[I3]], ptr noundef nonnull align 1 dereferenceable(753) [[TMP1]], i64 753, i1 false) ; CUSTOM-NEXT: ret void ; ; O2-LABEL: define void @memcpy_forward_back_with_offset( -; O2-SAME: ptr nocapture [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { -; O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1 +; O2-SAME: ptr nocapture writeonly [[ARG:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; O2-NEXT: store i8 
1, ptr [[ARG]], align 1 -; O2-NEXT: [[I3:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 1 -; O2-NEXT: tail call void @llvm.memmove.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(753) [[I3]], ptr noundef nonnull align 1 dereferenceable(753) [[TMP1]], i64 753, i1 false) ; O2-NEXT: ret void ; %i = alloca [753 x i8], align 1 From a43d005560d346513af013b5f23c08358595968a Mon Sep 17 00:00:00 2001 From: DianQK Date: Thu, 11 Jul 2024 22:02:11 +0800 Subject: [PATCH 10/12] Re-generate lifetime.ll --- llvm/test/Transforms/MemCpyOpt/lifetime.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/Transforms/MemCpyOpt/lifetime.ll b/llvm/test/Transforms/MemCpyOpt/lifetime.ll index 1a07e6ce7476c..615887474aaaa 100644 --- a/llvm/test/Transforms/MemCpyOpt/lifetime.ll +++ b/llvm/test/Transforms/MemCpyOpt/lifetime.ll @@ -124,7 +124,7 @@ define void @call_slot_lifetime_bitcast(ptr %ptr) { ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 8 [[TMP2]], ptr align 4 [[PTR:%.*]], i64 4, i1 false) ; CHECK-NEXT: [[TMP1_CAST:%.*]] = bitcast ptr [[TMP1]] to ptr ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[TMP1_CAST]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP1_CAST]], ptr align 4 [[PTR]], i64 4, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[TMP1]], ptr align 4 [[PTR]], i64 4, i1 false) ; CHECK-NEXT: ret void ; %tmp1 = alloca i32 From c828a2fecf6432b647c515633dd6f589aa2a27c2 Mon Sep 17 00:00:00 2001 From: DianQK Date: Thu, 11 Jul 2024 22:08:02 +0800 Subject: [PATCH 11/12] Add nikic's comment --- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index ab01ef50c5e91..e5983e6b58e1b 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1163,6 +1163,11 @@ bool 
MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, auto *CopySource = MDep->getSource(); auto CleanupOnFailure = llvm::make_scope_exit([&CopySource] { if (CopySource->use_empty()) + // Safety: It's safe here because we will only allocate more instructions + // after finishing all BatchAA queries, but we have to be careful if we + // want to do something like this in another place. Then we'd probably + // have to delay instruction removal until all transforms on an + // instruction finished. cast<Instruction>(CopySource)->eraseFromParent(); }); MaybeAlign CopySourceAlign = MDep->getSourceAlign(); From ba0188dfe81a4ff8defca6c0e182cc293a441781 Mon Sep 17 00:00:00 2001 From: DianQK Date: Fri, 12 Jul 2024 07:49:20 +0800 Subject: [PATCH 12/12] Only delete the instruction we created --- llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp | 15 +++++++++------ .../Transforms/MemCpyOpt/memcpy-memcpy-offset.ll | 1 + 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index e5983e6b58e1b..1c65219585e5a 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1161,14 +1161,15 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, IRBuilder<> Builder(M); auto *CopySource = MDep->getSource(); - auto CleanupOnFailure = llvm::make_scope_exit([&CopySource] { - if (CopySource->use_empty()) + Instruction *NewCopySource = nullptr; + auto CleanupOnRet = llvm::make_scope_exit([&NewCopySource] { + if (NewCopySource && NewCopySource->use_empty()) // Safety: It's safe here because we will only allocate more instructions // after finishing all BatchAA queries, but we have to be careful if we // want to do something like this in another place. Then we'd probably // have to delay instruction removal until all transforms on an // instruction finished.
- cast<Instruction>(CopySource)->eraseFromParent(); + NewCopySource->eraseFromParent(); }); MaybeAlign CopySourceAlign = MDep->getSourceAlign(); // We just need to calculate the actual size of the copy. @@ -1186,9 +1187,11 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M, M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL); if (MDestOffset == MForwardOffset) CopySource = M->getDest(); - else - CopySource = Builder.CreateInBoundsPtrAdd( - CopySource, Builder.getInt64(MForwardOffset)); + else { + NewCopySource = cast<Instruction>(Builder.CreateInBoundsPtrAdd( + CopySource, Builder.getInt64(MForwardOffset))); + CopySource = NewCopySource; + } // We need to update `MCopyLoc` if an offset exists. MCopyLoc = MCopyLoc.getWithNewPtr(CopySource); if (CopySourceAlign) diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll index 07fc6880746ed..6abb0da827990 100644 --- a/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy-memcpy-offset.ll @@ -64,6 +64,7 @@ define void @forward_offset_without_gep(ptr %src) { ; CHECK-NEXT: [[TMP:%.*]] = alloca [9 x i8], align 1 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP]], ptr align 1 [[SRC]], i64 7, i1 false) ; CHECK-NEXT: [[TMP_OFFSET:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 1 +; CHECK-NEXT: [[DEST:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 ; CHECK-NEXT: ret void ; %cpy_tmp = alloca %buf, align 1