diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 605937503407a..907c4577d93d3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -138,6 +138,11 @@ static cl::opt<bool> EnableReduceLoadOpStoreWidth(
     "combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
     cl::desc("DAG combiner enable reducing the width of load/op/store "
              "sequence"));
+static cl::opt<bool> ReduceLoadOpStoreWidthForceNarrowingProfitable(
+    "combiner-reduce-load-op-store-width-force-narrowing-profitable",
+    cl::Hidden, cl::init(false),
+    cl::desc("DAG combiner force override the narrowing profitable check when "
+             "reducing the width of load/op/store sequences"));
 static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
     "combiner-shrink-load-replace-store-with-store", cl::Hidden,
     cl::init(true),
@@ -20351,19 +20356,38 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     // The narrowing should be profitable, the load/store operation should be
     // legal (or custom) and the store size should be equal to the NewVT width.
-    while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
-                                !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
-                                !TLI.isNarrowingProfitable(N, VT, NewVT))) {
+    while (NewBW < BitWidth &&
+           (NewVT.getStoreSizeInBits() != NewBW ||
+            !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+            (!ReduceLoadOpStoreWidthForceNarrowingProfitable &&
+             !TLI.isNarrowingProfitable(N, VT, NewVT)))) {
       NewBW = NextPowerOf2(NewBW);
       NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     }
     if (NewBW >= BitWidth)
       return SDValue();
 
-    // If the lsb changed does not start at the type bitwidth boundary,
-    // start at the previous one.
-    if (ShAmt % NewBW)
-      ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
+    // TODO: For big-endian we probably want to align given the most significant
+    // bit being modified instead of adjusting ShAmt based on least significant
+    // bits. This to reduce the risk of failing on the alignment check below. If
+    // for example VT.getStoreSize()==5 and Imm is 0x0000ffff00, then we want to
+    // find NewBW=16, and we want to load/store with a PtrOff set to 2. But then
+    // ShAmt should be set to 8, which isn't a multiple of NewBW. But given
+    // that isNarrowingProfitable doesn't seem to be overridden for any in-tree
+    // big-endian target, then the support for big-endian here isn't covered by
+    // any in-tree lit tests, so it is unfortunately not highly optimized
+    // either. It should be possible to improve that by using
+    // ReduceLoadOpStoreWidthForceNarrowingProfitable.
+
+    // If the lsb that is modified does not start at the type bitwidth boundary,
+    // align to start at the previous boundary.
+    ShAmt = ShAmt - (ShAmt % NewBW);
+
+    // Make sure we do not access memory outside the memory touched by the
+    // original load/store.
+    if (ShAmt + NewBW > VT.getStoreSizeInBits())
+      return SDValue();
+
     APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
                                    std::min(BitWidth, ShAmt + NewBW));
     if ((Imm & Mask) == Imm) {
diff --git a/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll
new file mode 100644
index 0000000000000..efdfa10f7ca07
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple armv7 -O1 | FileCheck %s -check-prefix=CHECK-LE-NORMAL
+; RUN: llc < %s -mtriple armv7 -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-LE-NARROW
+; RUN: llc < %s -mtriple armv7eb -O1 | FileCheck %s -check-prefix=CHECK-BE-NORMAL
+; RUN: llc < %s -mtriple armv7eb -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-BE-NARROW
+
+; This is a reproducer for a bug when DAGCombiner::ReduceLoadOpStoreWidth
+; would end up narrowing the load-op-store sequence into this SDNode sequence
+; for little-endian
+;
+;   t18: i32,ch = load<(load (s32) from %ir.p1 + 8, align 8)> t0, t17, undef:i32
+;   t20: i32 = or t18, Constant:i32<65534>
+;   t21: ch = store<(store (s32) into %ir.p1 + 8, align 8)> t18:1, t20, t17, undef:i32
+;
+; This was wrong since it accesses memory above %ir.p1+9 which is outside the
+; "store size" for the original store.
+;
+; For big-endian we used to hit an assertion due to passing a negative offset
+; to getMemBasePlusOffset (at least after commit 3e1b55cafc95d4ef4, while
+; before that commit we got load/store instructions that accessed memory at a
+; negative offset from %p1).
+;
+define i16 @test(ptr %p1) {
+; CHECK-LE-NORMAL-LABEL: test:
+; CHECK-LE-NORMAL:         @ %bb.0: @ %entry
+; CHECK-LE-NORMAL-NEXT:    ldrh r1, [r0, #8]
+; CHECK-LE-NORMAL-NEXT:    movw r2, #65534
+; CHECK-LE-NORMAL-NEXT:    orr r1, r1, r2
+; CHECK-LE-NORMAL-NEXT:    strh r1, [r0, #8]
+; CHECK-LE-NORMAL-NEXT:    mov r0, #0
+; CHECK-LE-NORMAL-NEXT:    bx lr
+;
+; CHECK-LE-NARROW-LABEL: test:
+; CHECK-LE-NARROW:         @ %bb.0: @ %entry
+; CHECK-LE-NARROW-NEXT:    ldrh r1, [r0, #8]
+; CHECK-LE-NARROW-NEXT:    movw r2, #65534
+; CHECK-LE-NARROW-NEXT:    orr r1, r1, r2
+; CHECK-LE-NARROW-NEXT:    strh r1, [r0, #8]
+; CHECK-LE-NARROW-NEXT:    mov r0, #0
+; CHECK-LE-NARROW-NEXT:    bx lr
+;
+; CHECK-BE-NORMAL-LABEL: test:
+; CHECK-BE-NORMAL:         @ %bb.0: @ %entry
+; CHECK-BE-NORMAL-NEXT:    ldrh r1, [r0]
+; CHECK-BE-NORMAL-NEXT:    movw r2, #65534
+; CHECK-BE-NORMAL-NEXT:    orr r1, r1, r2
+; CHECK-BE-NORMAL-NEXT:    strh r1, [r0]
+; CHECK-BE-NORMAL-NEXT:    mov r0, #0
+; CHECK-BE-NORMAL-NEXT:    bx lr
+;
+; CHECK-BE-NARROW-LABEL: test:
+; CHECK-BE-NARROW:         @ %bb.0: @ %entry
+; CHECK-BE-NARROW-NEXT:    ldrh r1, [r0]
+; CHECK-BE-NARROW-NEXT:    movw r2, #65534
+; CHECK-BE-NARROW-NEXT:    orr r1, r1, r2
+; CHECK-BE-NARROW-NEXT:    strh r1, [r0]
+; CHECK-BE-NARROW-NEXT:    mov r0, #0
+; CHECK-BE-NARROW-NEXT:    bx lr
+entry:
+  %load = load i80, ptr %p1, align 32
+  %mask = shl i80 -1, 65
+  %op = or i80 %load, %mask
+  store i80 %op, ptr %p1, align 32
+  ret i16 0
+}
+
diff --git a/llvm/test/CodeGen/X86/store_op_load_fold.ll b/llvm/test/CodeGen/X86/store_op_load_fold.ll
index bd7068535eabc..2915d1a7d7ae1 100644
--- a/llvm/test/CodeGen/X86/store_op_load_fold.ll
+++ b/llvm/test/CodeGen/X86/store_op_load_fold.ll
@@ -23,7 +23,10 @@ define void @test2() nounwind uwtable ssp {
 ; CHECK-LABEL: test2:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl L_s2$non_lazy_ptr, %eax
-; CHECK-NEXT:    andl $-262144, 20(%eax) ## imm = 0xFFFC0000
+; CHECK-NEXT:    movzbl 22(%eax), %ecx
+; CHECK-NEXT:    andl $-4, %ecx
+; CHECK-NEXT:    movb %cl, 22(%eax)
+; CHECK-NEXT:    movw $0, 20(%eax)
 ; CHECK-NEXT:    retl
   %bf.load35 = load i56, ptr getelementptr inbounds (%struct.S2, ptr @s2, i32 0, i32 5), align 16
   %bf.clear36 = and i56 %bf.load35, -1125895611875329
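
For context (not part of the patch itself): below is a minimal standalone sketch of the placement rule the new code enforces, using plain unsigned integers in place of LLVM's APInt/EVT machinery; the function and variable names are invented for illustration. The shift amount is rounded down to a NewBW boundary, and the combine is rejected when the narrowed access would reach past the store size of the original value. For the i80 store in the ARM test (store size 80 bits, lowest modified bit 65, NewBW 32) this gives ShAmt 64, and 64 + 32 > 80, so the out-of-bounds 32-bit access at byte offset 8 is no longer generated.

// Sketch only; not LLVM code. Assumes the simplified inputs described above.
#include <cassert>
#include <cstdint>
#include <optional>

// Returns the bit offset of the narrowed access, or std::nullopt if the
// narrowed load/store would touch memory outside the original value.
std::optional<uint64_t> placeNarrowedAccess(uint64_t ShAmt, uint64_t NewBW,
                                            uint64_t StoreSizeInBits) {
  // Align the lowest modified bit down to a NewBW boundary.
  ShAmt -= ShAmt % NewBW;
  // Reject the combine if the narrowed access would extend past the memory
  // touched by the original load/store.
  if (ShAmt + NewBW > StoreSizeInBits)
    return std::nullopt;
  return ShAmt;
}

int main() {
  // The i80 case from the ARM test: lowest modified bit 65, NewBW 32, store
  // size 80 bits. The old code produced a 32-bit access at bit offset 64
  // (byte offset 8), touching bytes 8..11 of a 10-byte value; now rejected.
  assert(!placeNarrowedAccess(65, 32, 80));
  // A case that still narrows: bits 16..23 modified in an i32 value.
  assert(placeNarrowedAccess(16, 8, 32) == 16);
  return 0;
}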