diff --git a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
index 496d2958fc2d0..d1369ae918959 100644
--- a/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ b/llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_TRANSFORMS_SCALAR_MEMCPYOPTIMIZER_H
 #define LLVM_TRANSFORMS_SCALAR_MEMCPYOPTIMIZER_H
 
+#include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/PassManager.h"
 
@@ -64,21 +65,28 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
 private:
   // Helper functions
   bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
+  bool processLoad(LoadInst *LI, BasicBlock::iterator &BBI,
+                   SmallVectorImpl<Instruction *> &NewInsts);
   bool processStoreOfLoad(StoreInst *SI, LoadInst *LI, const DataLayout &DL,
                           BasicBlock::iterator &BBI);
   bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
-  bool processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI);
+  bool processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI,
+                     SmallVectorImpl<Instruction *> &NewInsts);
   bool processMemMove(MemMoveInst *M, BasicBlock::iterator &BBI);
   bool performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore,
                             Value *cpyDst, Value *cpySrc, TypeSize cpyLen,
                             Align cpyAlign, BatchAAResults &BAA,
                             std::function<CallInst *()> GetC);
   bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep,
-                                     BatchAAResults &BAA);
+                                     BatchAAResults &BAA,
+                                     SmallVectorImpl<Instruction *> &NewInsts);
   bool processMemSetMemCpyDependence(MemCpyInst *MemCpy, MemSetInst *MemSet,
                                      BatchAAResults &BAA);
   bool performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet,
                                   BatchAAResults &BAA);
+  bool findNewSrc(MemCpyInst *MDep, Instruction *UseInstr, BatchAAResults &BAA,
+                  Value *&NewSrc, MaybeAlign &NewAlign,
+                  SmallVectorImpl<Instruction *> &NewInsts);
   bool processByValArgument(CallBase &CB, unsigned ArgNo);
   bool processImmutArgument(CallBase &CB, unsigned ArgNo);
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
@@ -90,7 +98,7 @@ class MemCpyOptPass : public PassInfoMixin<MemCpyOptPass> {
   bool isMemMoveMemSetDependency(MemMoveInst *M);
 
   void eraseInstruction(Instruction *I);
-  bool iterateOnFunction(Function &F);
+  bool iterateOnFunction(Function &F, SmallVectorImpl<Instruction *> &NewInsts);
 };
 
 } // end namespace llvm
 
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 6407f48dc2c05..1dfa6bc787278 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -74,6 +74,7 @@ STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
 STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
 STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
 STATISTIC(NumStackMove, "Number of stack-move optimizations performed");
+STATISTIC(NumLoadInstr, "Number of load instruction optimizations performed");
 
 namespace {
 
@@ -739,6 +740,145 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI,
   return false;
 }
 
+bool MemCpyOptPass::findNewSrc(MemCpyInst *MDep, Instruction *UseInstr,
+                               BatchAAResults &BAA, Value *&NewSrc,
+                               MaybeAlign &NewAlign,
+                               SmallVectorImpl<Instruction *> &NewInsts) {
+  auto *MemCpy = dyn_cast<MemCpyInst>(UseInstr);
+  auto *LoadI = dyn_cast<LoadInst>(UseInstr);
+  MemoryLocation UseLoc;
+  Value *OldSrc;
+  if (MemCpy) {
+    UseLoc = MemoryLocation::getForSource(MemCpy);
+    OldSrc = MemCpy->getSource();
+  } else if (LoadI) {
+    UseLoc = MemoryLocation::get(LoadI);
+    OldSrc = LoadI->getPointerOperand();
+  } else
+    return false;
+  uint64_t UseLen = 0;
+  if (UseLoc.Size.hasValue())
+    UseLen = UseLoc.Size.getValue().getKnownMinValue();
+  // If dep instruction is reading from our current input, then it is a noop
+  // transfer and substituting the input won't change this instruction. Just
+  // ignore the input and let someone else zap MDep. This handles cases like:
+  // memcpy(a <- a)
+  // memcpy(b <- a)
+  if (OldSrc == MDep->getSource())
+    return false;
+
+  // We can only optimize non-volatile memcpy's.
+  if (MDep->isVolatile())
+    return false;
+
+  int64_t MForwardOffset = 0;
+  const DataLayout &DL = MDep->getDataLayout();
+  // We can only transform memcpy's where the dest of one is the source of the
+  // other, or they have an offset in a range.
+  if (OldSrc != MDep->getDest()) {
+    std::optional<int64_t> Offset =
+        OldSrc->getPointerOffsetFrom(MDep->getDest(), DL);
+    if (!Offset || *Offset < 0)
+      return false;
+    MForwardOffset = *Offset;
+  }
+
+  // The length of the memcpy's must be the same, or the preceding one
+  // must be larger than the following one.
+  if (MForwardOffset != 0 || LoadI ||
+      (MemCpy && MDep->getLength() != MemCpy->getLength())) {
+    auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+    if (UseLen == 0 || !MDepLen ||
+        MDepLen->getZExtValue() < UseLen + MForwardOffset)
+      return false;
+  }
+  IRBuilder<> Builder(UseInstr);
+  NewSrc = MDep->getSource();
+  NewAlign = MDep->getSourceAlign();
+  // We just need to calculate the actual size of the copy.
+  auto MCopyLoc =
+      MemoryLocation::getForSource(MDep).getWithNewSize(UseLoc.Size);
+
+  // When the forwarding offset is greater than 0, we transform
+  // memcpy(d1 <- s1)
+  // memcpy(d2 <- d1+o)
+  // to
+  // memcpy(d2 <- s1+o)
+  if (MForwardOffset > 0) {
+    // The copy destination of `M` may be able to serve as the source of the copy.
+    if (MemCpy && (MForwardOffset == MemCpy->getRawDest()->getPointerOffsetFrom(
+                                         MDep->getRawSource(), DL))) {
+      NewSrc = cast<MemCpyInst>(UseInstr)->getDest();
+    } else {
+      NewSrc = Builder.CreateInBoundsPtrAdd(NewSrc,
+                                            Builder.getInt64(MForwardOffset));
+      if (Instruction *NewI = dyn_cast<Instruction>(NewSrc))
+        NewInsts.push_back(NewI);
+    }
+    // We need to update `MCopyLoc` if an offset exists.
+    MCopyLoc = MCopyLoc.getWithNewPtr(NewSrc);
+    if (NewAlign)
+      NewAlign = commonAlignment(*NewAlign, MForwardOffset);
+  }
+
+  // Avoid infinite loops
+  if (BAA.isMustAlias(OldSrc, NewSrc))
+    return false;
+  // Verify that the copied-from memory doesn't change in between the two
+  // transfers. For example, in:
+  // memcpy(a <- b)
+  // *b = 42;
+  // memcpy(c <- a)
+  // It would be invalid to transform the second memcpy into memcpy(c <- b).
+  //
+  // TODO: If the code between M and MDep is transparent to the destination "c",
+  // then we could still perform the xform by moving M up to the first memcpy.
+  if (writtenBetween(MSSA, BAA, MCopyLoc, MSSA->getMemoryAccess(MDep),
+                     MSSA->getMemoryAccess(UseInstr)))
+    return false;
+  return true;
+}
+
+/// Perform simplification of loads. If we have a memcpy A which copies X to Y,
+/// and a load instruction B which loads from Y, then we can rewrite B to load
+/// from X instead. This allows later passes to remove memcpy A or identify the
+/// source of the load.
+bool MemCpyOptPass::processLoad(LoadInst *LI, BasicBlock::iterator &BBI,
+                                SmallVectorImpl<Instruction *> &NewInsts) {
+  if (!LI->isSimple())
+    return false;
+  MemoryUseOrDef *MA = MSSA->getMemoryAccess(LI);
+  if (!MA)
+    return false;
+  BatchAAResults BAA(*AA, EEA);
+
+  MemoryAccess *AnyClobber = MA->getDefiningAccess();
+  const MemoryAccess *DestClobber =
+      MSSA->getWalker()->getClobberingMemoryAccess(
+          AnyClobber, MemoryLocation::get(LI), BAA);
+  MemCpyInst *MDep = nullptr;
+  if (auto *MD = dyn_cast<MemoryDef>(DestClobber))
+    if (Instruction *MI = MD->getMemoryInst())
+      MDep = dyn_cast<MemCpyInst>(MI);
+
+  if (!MDep)
+    return false;
+
+  Value *NewSrc;
+  MaybeAlign NewAlign;
+  if (!findNewSrc(MDep, LI, BAA, NewSrc, NewAlign, NewInsts))
+    return false;
+  IRBuilder<> Builder(LI);
+  Instruction *NewLI =
+      Builder.CreateAlignedLoad(LI->getType(), NewSrc, NewAlign, LI->getName());
+  auto *NewAccess = MSSAU->createMemoryAccessAfter(NewLI, nullptr, MA);
+  MSSAU->insertUse(cast<MemoryUse>(NewAccess), /*RenameUses=*/true);
+  LI->replaceAllUsesWith(NewLI);
+  eraseInstruction(LI);
+  ++NumLoadInstr;
+  return true;
+}
+
 bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
   if (!SI->isSimple())
     return false;
@@ -1101,101 +1241,18 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
 
 /// We've found that the (upward scanning) memory dependence of memcpy 'M' is
 /// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
-bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
-                                                  MemCpyInst *MDep,
-                                                  BatchAAResults &BAA) {
-  // If dep instruction is reading from our current input, then it is a noop
-  // transfer and substituting the input won't change this instruction. Just
-  // ignore the input and let someone else zap MDep. This handles cases like:
-  // memcpy(a <- a)
-  // memcpy(b <- a)
-  if (M->getSource() == MDep->getSource())
-    return false;
-
-  // We can only optimize non-volatile memcpy's.
-  if (MDep->isVolatile())
+bool MemCpyOptPass::processMemCpyMemCpyDependence(
+    MemCpyInst *M, MemCpyInst *MDep, BatchAAResults &BAA,
+    SmallVectorImpl<Instruction *> &NewInsts) {
+  Value *NewSrc;
+  MaybeAlign NewAlign;
+  if (!findNewSrc(MDep, M, BAA, NewSrc, NewAlign, NewInsts))
     return false;
 
-  int64_t MForwardOffset = 0;
-  const DataLayout &DL = M->getModule()->getDataLayout();
-  // We can only transforms memcpy's where the dest of one is the source of the
-  // other, or they have an offset in a range.
-  if (M->getSource() != MDep->getDest()) {
-    std::optional<int64_t> Offset =
-        M->getSource()->getPointerOffsetFrom(MDep->getDest(), DL);
-    if (!Offset || *Offset < 0)
-      return false;
-    MForwardOffset = *Offset;
-  }
-
-  // The length of the memcpy's must be the same, or the preceding one
-  // must be larger than the following one.
-  if (MForwardOffset != 0 || MDep->getLength() != M->getLength()) {
-    auto *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
-    auto *MLen = dyn_cast<ConstantInt>(M->getLength());
-    if (!MDepLen || !MLen ||
-        MDepLen->getZExtValue() < MLen->getZExtValue() + MForwardOffset)
-      return false;
-  }
-
-  IRBuilder<> Builder(M);
-  auto *CopySource = MDep->getSource();
-  Instruction *NewCopySource = nullptr;
-  auto CleanupOnRet = llvm::make_scope_exit([&] {
-    if (NewCopySource && NewCopySource->use_empty())
-      // Safety: It's safe here because we will only allocate more instructions
-      // after finishing all BatchAA queries, but we have to be careful if we
-      // want to do something like this in another place. Then we'd probably
-      // have to delay instruction removal until all transforms on an
-      // instruction finished.
-      eraseInstruction(NewCopySource);
-  });
-  MaybeAlign CopySourceAlign = MDep->getSourceAlign();
-  // We just need to calculate the actual size of the copy.
-  auto MCopyLoc = MemoryLocation::getForSource(MDep).getWithNewSize(
-      MemoryLocation::getForSource(M).Size);
-
-  // When the forwarding offset is greater than 0, we transform
-  // memcpy(d1 <- s1)
-  // memcpy(d2 <- d1+o)
-  // to
-  // memcpy(d2 <- s1+o)
-  if (MForwardOffset > 0) {
-    // The copy destination of `M` maybe can serve as the source of copying.
-    std::optional<int64_t> MDestOffset =
-        M->getRawDest()->getPointerOffsetFrom(MDep->getRawSource(), DL);
-    if (MDestOffset == MForwardOffset)
-      CopySource = M->getDest();
-    else {
-      CopySource = Builder.CreateInBoundsPtrAdd(
-          CopySource, Builder.getInt64(MForwardOffset));
-      NewCopySource = dyn_cast<Instruction>(CopySource);
-    }
-    // We need to update `MCopyLoc` if an offset exists.
-    MCopyLoc = MCopyLoc.getWithNewPtr(CopySource);
-    if (CopySourceAlign)
-      CopySourceAlign = commonAlignment(*CopySourceAlign, MForwardOffset);
-  }
-
-  // Avoid infinite loops
-  if (BAA.isMustAlias(M->getSource(), CopySource))
-    return false;
-
-  // Verify that the copied-from memory doesn't change in between the two
-  // transfers. For example, in:
-  // memcpy(a <- b)
-  // *b = 42;
-  // memcpy(c <- a)
-  // It would be invalid to transform the second memcpy into memcpy(c <- b).
-  //
-  // TODO: If the code between M and MDep is transparent to the destination "c",
-  // then we could still perform the xform by moving M up to the first memcpy.
-  if (writtenBetween(MSSA, BAA, MCopyLoc, MSSA->getMemoryAccess(MDep),
-                     MSSA->getMemoryAccess(M)))
-    return false;
 
   // No need to create `memcpy(a <- a)`.
-  if (BAA.isMustAlias(M->getDest(), CopySource)) {
+  if (BAA.isMustAlias(M->getDest(), NewSrc)) {
     // Remove the instruction we're replacing.
     eraseInstruction(M);
     ++NumMemCpyInstr;
@@ -1226,20 +1283,18 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
   // example we could be moving from movaps -> movq on x86.
   Instruction *NewM;
   if (UseMemMove)
-    NewM =
-        Builder.CreateMemMove(M->getDest(), M->getDestAlign(), CopySource,
-                              CopySourceAlign, M->getLength(), M->isVolatile());
+    NewM = Builder.CreateMemMove(M->getDest(), M->getDestAlign(), NewSrc,
+                                 NewAlign, M->getLength(), M->isVolatile());
   else if (isa<MemCpyInlineInst>(M)) {
     // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is
     // never allowed since that would allow the latter to be lowered as a call
    // to an external function.
-    NewM = Builder.CreateMemCpyInline(M->getDest(), M->getDestAlign(),
-                                      CopySource, CopySourceAlign,
-                                      M->getLength(), M->isVolatile());
-  } else
     NewM =
-        Builder.CreateMemCpy(M->getDest(), M->getDestAlign(), CopySource,
-                             CopySourceAlign, M->getLength(), M->isVolatile());
+        Builder.CreateMemCpyInline(M->getDest(), M->getDestAlign(), NewSrc,
+                                   NewAlign, M->getLength(), M->isVolatile());
+  } else
+    NewM = Builder.CreateMemCpy(M->getDest(), M->getDestAlign(), NewSrc,
+                                NewAlign, M->getLength(), M->isVolatile());
 
   NewM->copyMetadata(*M, LLVMContext::MD_DIAssignID);
 
   assert(isa<MemoryDef>(MSSA->getMemoryAccess(M)));
@@ -1703,7 +1758,8 @@ static bool isZeroSize(Value *Size) {
 /// B to be a memcpy from X to Z (or potentially a memmove, depending on
 /// circumstances). This allows later passes to remove the first memcpy
 /// altogether.
-bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
+bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI,
+                                  SmallVectorImpl<Instruction *> &NewInsts) {
   // We can only optimize non-volatile memcpy's.
   if (M->isVolatile())
     return false;
 
@@ -1791,7 +1847,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
       }
     }
     if (auto *MDep = dyn_cast<MemCpyInst>(MI))
-      if (processMemCpyMemCpyDependence(M, MDep, BAA))
+      if (processMemCpyMemCpyDependence(M, MDep, BAA, NewInsts))
         return true;
     if (auto *MDep = dyn_cast<MemSetInst>(MI)) {
       if (performMemCpyToMemSetOptzn(M, MDep, BAA)) {
@@ -2096,7 +2152,8 @@ bool MemCpyOptPass::processImmutArgument(CallBase &CB, unsigned ArgNo) {
 }
 
 /// Executes one iteration of MemCpyOptPass.
-bool MemCpyOptPass::iterateOnFunction(Function &F) {
+bool MemCpyOptPass::iterateOnFunction(
+    Function &F, SmallVectorImpl<Instruction *> &NewInsts) {
   bool MadeChange = false;
 
   // Walk all instruction in the function.
@@ -2114,12 +2171,14 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
 
       bool RepeatInstruction = false;
 
-      if (auto *SI = dyn_cast<StoreInst>(I))
+      if (auto *LI = dyn_cast<LoadInst>(I))
+        MadeChange |= processLoad(LI, BI, NewInsts);
+      else if (auto *SI = dyn_cast<StoreInst>(I))
         MadeChange |= processStore(SI, BI);
      else if (auto *M = dyn_cast<MemSetInst>(I))
        RepeatInstruction = processMemSet(M, BI);
      else if (auto *M = dyn_cast<MemCpyInst>(I))
-        RepeatInstruction = processMemCpy(M, BI);
+        RepeatInstruction = processMemCpy(M, BI, NewInsts);
      else if (auto *M = dyn_cast<MemMoveInst>(I))
        RepeatInstruction = processMemMove(M, BI);
      else if (auto *CB = dyn_cast<CallBase>(I)) {
@@ -2176,13 +2235,19 @@ bool MemCpyOptPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
   MSSAU = &MSSAU_;
   EarliestEscapeAnalysis EEA_(*DT);
   EEA = &EEA_;
+  SmallVector<Instruction *> NewInsts;
 
   while (true) {
-    if (!iterateOnFunction(F))
+    if (!iterateOnFunction(F, NewInsts))
       break;
     MadeChange = true;
   }
 
+  for (auto *I : NewInsts) {
+    if (I->use_empty())
+      eraseInstruction(I);
+  }
+
   if (VerifyMemorySSA)
     MSSA_->verifyMemorySSA();
 
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll b/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll
new file mode 100644
index 0000000000000..462e03f22c2f1
--- /dev/null
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy-load.ll
@@ -0,0 +1,97 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=memcpyopt -S -verify-memoryssa | FileCheck %s
+
+define i24 @forward_load(ptr %src) {
+; CHECK-LABEL: define i24 @forward_load(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 3, i1 false)
+; CHECK-NEXT:    [[VAL1:%.*]] = load i24, ptr [[SRC]], align 4
+; CHECK-NEXT:    ret i24 [[VAL1]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+  %val = load i24, ptr %dest
+  ret i24 %val
+}
+
+define i16 @forward_load_2(ptr %src) {
+; CHECK-LABEL: define i16 @forward_load_2(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 2, i1 false)
+; CHECK-NEXT:    [[VAL1:%.*]] = load i16, ptr [[SRC]], align 2
+; CHECK-NEXT:    ret i16 [[VAL1]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 2, i1 false)
+  %val = load i16, ptr %dest
+  ret i16 %val
+}
+
+define i32 @forward_load_padding(ptr %src) {
+; CHECK-LABEL: define i32 @forward_load_padding(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca { i8, i32 }, align 8
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 8, i1 false)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DEST]], i64 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 4
+; CHECK-NEXT:    [[VAL1:%.*]] = load i32, ptr [[TMP1]], align 4
+; CHECK-NEXT:    ret i32 [[VAL1]]
+;
+  %dest = alloca { i8, i32 }
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 8, i1 false)
+  %gep = getelementptr inbounds i8, ptr %dest, i64 4
+  %val = load i32, ptr %gep
+  ret i32 %val
+}
+
+; Negative tests
+
+define i24 @failed_forward_load_write_src(ptr %src) {
+; CHECK-LABEL: define i24 @failed_forward_load_write_src(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 3, i1 false)
+; CHECK-NEXT:    store i1 true, ptr [[SRC]], align 1
+; CHECK-NEXT:    [[VAL:%.*]] = load i24, ptr [[DEST]], align 4
+; CHECK-NEXT:    ret i24 [[VAL]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
+  store i1 true, ptr %src
+  %val = load i24, ptr %dest
+  ret i24 %val
+}
+
+define i16 @failed_forward_load_size(ptr %src) {
+; CHECK-LABEL: define i16 @failed_forward_load_size(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 1, i1 false)
+; CHECK-NEXT:    [[VAL:%.*]] = load i16, ptr [[DEST]], align 2
+; CHECK-NEXT:    ret i16 [[VAL]]
+;
+  %dest = alloca [3 x i8]
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 1, i1 false)
+  %val = load i16, ptr %dest
+  ret i16 %val
+}
+
+define i32 @failed_forward_load_padding(ptr %src) {
+; CHECK-LABEL: define i32 @failed_forward_load_padding(
+; CHECK-SAME: ptr [[SRC:%.*]]) {
+; CHECK-NEXT:    [[DEST:%.*]] = alloca { i8, i32 }, align 8
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DEST]], ptr [[SRC]], i64 5, i1 false)
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[DEST]], i64 4
+; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT:    ret i32 [[VAL]]
+;
+  %dest = alloca { i8, i32 }
+  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 5, i1 false)
+  %gep = getelementptr inbounds i8, ptr %dest, i64 4
+  %val = load i32, ptr %gep
+  ret i32 %val
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
index 89d8eb1ee6711..066325086b7f0 100644
--- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll
@@ -229,10 +229,8 @@ define void @test4_write_between(ptr %P) {
 
 define i8 @test4_read_between(ptr %P) {
 ; CHECK-LABEL: @test4_read_between(
-; CHECK-NEXT:    [[A1:%.*]] = alloca [[TMP1:%.*]], align 8
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A1]], ptr align 4 [[P:%.*]], i64 8, i1 false)
-; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[A1]], align 1
-; CHECK-NEXT:    call void @test4a(ptr byval(i8) align 1 [[P]])
+; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[A1:%.*]], align 4
+; CHECK-NEXT:    call void @test4a(ptr byval(i8) align 1 [[A1]])
 ; CHECK-NEXT:    ret i8 [[X]]
 ;
   %a1 = alloca %1
diff --git a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
index 5e13432746bf7..51689cc6fd452 100644
--- a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
+++ b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
@@ -20,7 +20,7 @@ define i32 @foo(i1 %z) {
 ; CHECK-NEXT:    br label [[FOR_INC7_1]]
 ; CHECK:       for.inc7.1:
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A]], ptr align 4 [[SCEVGEP]], i64 4, i1 false)
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[SCEVGEP]], align 4
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
 entry:
diff --git a/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
new file mode 100644
index 0000000000000..006f15a31c4e1
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/pr137810-forward-load.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -O2 -S < %s | FileCheck %s
+
+; FIXME: This can be simplified to return true.
+define i1 @main(ptr %i2) {
+; CHECK-LABEL: define noundef i1 @main(
+; CHECK-SAME: ptr captures(none) initializes((0, 3)) [[I2:%.*]]) local_unnamed_addr {
+; CHECK-NEXT:    [[I1:%.*]] = alloca [3 x i8], align 1
+; CHECK-NEXT:    store i8 0, ptr [[I2]], align 1
+; CHECK-NEXT:    [[I3:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
+; CHECK-NEXT:    store i8 1, ptr [[I3]], align 1
+; CHECK-NEXT:    [[I4:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
+; CHECK-NEXT:    store i8 2, ptr [[I4]], align 1
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 3, ptr nonnull [[I1]])
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(3) [[I1]], ptr noundef nonnull align 1 dereferenceable(3) [[I2]], i64 3, i1 false)
+; CHECK-NEXT:    [[I51:%.*]] = load i8, ptr [[I2]], align 1
+; CHECK-NEXT:    [[I6:%.*]] = icmp eq i8 [[I51]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 1
+; CHECK-NEXT:    [[I82:%.*]] = load i8, ptr [[TMP1]], align 1
+; CHECK-NEXT:    [[I9:%.*]] = icmp eq i8 [[I82]], 1
+; CHECK-NEXT:    [[I10:%.*]] = select i1 [[I6]], i1 [[I9]], i1 false
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[I2]], i64 2
+; CHECK-NEXT:    [[I123:%.*]] = load i8, ptr [[TMP2]], align 1
+; CHECK-NEXT:    [[I13:%.*]] = icmp eq i8 [[I123]], 2
+; CHECK-NEXT:    [[I14:%.*]] = select i1 [[I10]], i1 [[I13]], i1 false
+; CHECK-NEXT:    br i1 [[I14]], label %[[TRUE:.*]], label %[[FALSE:.*]]
+; CHECK:       [[COMMON_RET:.*]]:
+; CHECK-NEXT:    ret i1 [[I14]]
+; CHECK:       [[TRUE]]:
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 3, ptr nonnull [[I1]])
+; CHECK-NEXT:    br label %[[COMMON_RET]]
+; CHECK:       [[FALSE]]:
+; CHECK-NEXT:    call void @assert_failed(ptr nonnull [[I1]])
+; CHECK-NEXT:    br label %[[COMMON_RET]]
+;
+  %i1 = alloca [3 x i8], align 1
+  store i8 0, ptr %i2, align 1
+  %i3 = getelementptr inbounds nuw i8, ptr %i2, i64 1
+  store i8 1, ptr %i3, align 1
+  %i4 = getelementptr inbounds nuw i8, ptr %i2, i64 2
+  store i8 2, ptr %i4, align 1
+  call void @llvm.lifetime.start.p0(i64 3, ptr nonnull %i1)
+  call void @llvm.memcpy.p0.p0.i64(ptr %i1, ptr %i2, i64 3, i1 false)
+  %i5 = load i8, ptr %i1, align 1
+  %i6 = icmp eq i8 %i5, 0
+  %i7 = getelementptr inbounds nuw i8, ptr %i1, i64 1
+  %i8 = load i8, ptr %i7, align 1
+  %i9 = icmp eq i8 %i8, 1
+  %i10 = select i1 %i6, i1 %i9, i1 false
+  %i11 = getelementptr inbounds nuw i8, ptr %i1, i64 2
+  %i12 = load i8, ptr %i11, align 1
+  %i13 = icmp eq i8 %i12, 2
+  %i14 = select i1 %i10, i1 %i13, i1 false
+  br i1 %i14, label %true, label %false
+
+true:
+  call void @llvm.lifetime.end.p0(i64 3, ptr nonnull %i1)
+  ret i1 true
+
+false:
+  call void @assert_failed(ptr %i1)
+  ret i1 false
+}
+
+declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1)
+declare void @llvm.lifetime.start.p0(i64, ptr)
+declare void @llvm.lifetime.end.p0(i64, ptr)
+declare void @assert_failed(ptr)
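
For reference, the rewrite performed by the new processLoad/findNewSrc path can be summarized with a minimal IR sketch. It mirrors the forward_load test above; the value names are illustrative rather than taken from the pass output:

  ; before memcpyopt
  %dest = alloca [3 x i8]
  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
  %val = load i24, ptr %dest

  ; after memcpyopt: the load now reads directly from the memcpy source,
  ; leaving the memcpy (and often the alloca) trivially dead for later passes
  %dest = alloca [3 x i8]
  call void @llvm.memcpy.p0.p0.i64(ptr %dest, ptr %src, i64 3, i1 false)
  %val1 = load i24, ptr %src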