diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1aab0fce4e1e5..f14f6903232d1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -138,6 +138,11 @@ static cl::opt<bool> EnableReduceLoadOpStoreWidth(
     "combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
     cl::desc("DAG combiner enable reducing the width of load/op/store "
              "sequence"));
+static cl::opt<bool> ReduceLoadOpStoreWidthForceNarrowingProfitable(
+    "combiner-reduce-load-op-store-width-force-narrowing-profitable",
+    cl::Hidden, cl::init(false),
+    cl::desc("DAG combiner force override the narrowing profitable check when "
+             "reducing the width of load/op/store sequences"));
 static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
     "combiner-shrink-load-replace-store-with-store", cl::Hidden,
     cl::init(true),
@@ -20334,7 +20339,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
       ST->getPointerInfo().getAddrSpace())
     return SDValue();
 
-  // Find the type to narrow it the load / op / store to.
+  // Find the type NewVT to narrow the load / op / store to.
   SDValue N1 = Value.getOperand(1);
   unsigned BitWidth = N1.getValueSizeInBits();
   APInt Imm = N1->getAsAPIntVal();
@@ -20342,66 +20347,90 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     Imm ^= APInt::getAllOnes(BitWidth);
   if (Imm == 0 || Imm.isAllOnes())
     return SDValue();
-  unsigned ShAmt = Imm.countr_zero();
-  unsigned MSB = BitWidth - Imm.countl_zero() - 1;
-  unsigned NewBW = NextPowerOf2(MSB - ShAmt);
+  // Find the least/most significant bits that need to be part of the
+  // narrowed operation. We assume the target will need to address/access
+  // full bytes, so we make sure to align LSB and MSB at byte boundaries.
+  unsigned BitsPerByteMask = 7u;
+  unsigned LSB = Imm.countr_zero() & ~BitsPerByteMask;
+  unsigned MSB = (Imm.getActiveBits() - 1) | BitsPerByteMask;
+  unsigned NewBW = NextPowerOf2(MSB - LSB);
   EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
   // The narrowing should be profitable, the load/store operation should be
   // legal (or custom) and the store size should be equal to the NewVT width.
-  while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
-                              !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
-                              !TLI.isNarrowingProfitable(N, VT, NewVT))) {
+  while (NewBW < BitWidth &&
+         (NewVT.getStoreSizeInBits() != NewBW ||
+          !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+          !(TLI.isNarrowingProfitable(N, VT, NewVT) ||
+            ReduceLoadOpStoreWidthForceNarrowingProfitable))) {
     NewBW = NextPowerOf2(NewBW);
     NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
   }
   if (NewBW >= BitWidth)
     return SDValue();
 
-  // If the lsb changed does not start at the type bitwidth boundary,
-  // start at the previous one.
-  if (ShAmt % NewBW)
-    ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
-  APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
-                                 std::min(BitWidth, ShAmt + NewBW));
-  if ((Imm & Mask) == Imm) {
-    APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
-    if (Opc == ISD::AND)
-      NewImm ^= APInt::getAllOnes(NewBW);
-    uint64_t PtrOff = ShAmt / 8;
-    // For big endian targets, we need to adjust the offset to the pointer to
-    // load the correct bytes.
-    if (DAG.getDataLayout().isBigEndian())
-      PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
+  // If we get this far, NewVT/NewBW reflect a power-of-2 sized type that is
+  // large enough to cover all bits that should be modified. This type might,
+  // however, be larger than actually needed (such as i32 while we only need
+  // to modify one byte). Now we need to find out how to align the memory
+  // accesses to satisfy preferred alignments as well as avoid accessing
+  // memory outside the store size of the original access.
+
+  unsigned VTStoreSize = VT.getStoreSizeInBits().getFixedValue();
+
+  // Let ShAmt denote the number of bits to skip, counted from the least
+  // significant bits of Imm, and let PtrOff denote how much the pointer
+  // needs to be offset (in bytes) for the new access.
+  unsigned ShAmt = 0;
+  uint64_t PtrOff = 0;
+  for (; ShAmt + NewBW <= VTStoreSize; ShAmt += 8) {
+    // Make sure the range [ShAmt, ShAmt+NewBW) covers both LSB and MSB.
+    if (ShAmt > LSB)
+      return SDValue();
+    if (ShAmt + NewBW < MSB)
+      continue;
+
+    // Calculate PtrOff.
+    unsigned PtrAdjustmentInBits = DAG.getDataLayout().isBigEndian()
+                                       ? VTStoreSize - NewBW - ShAmt
+                                       : ShAmt;
+    PtrOff = PtrAdjustmentInBits / 8;
+    // Now check if narrow access is allowed and fast, considering alignments.
     unsigned IsFast = 0;
     Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
-    if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
-                                LD->getAddressSpace(), NewAlign,
-                                LD->getMemOperand()->getFlags(), &IsFast) ||
-        !IsFast)
-      return SDValue();
-
-    SDValue NewPtr =
-        DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
-    SDValue NewLD =
-        DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
-                    LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
-                    LD->getMemOperand()->getFlags(), LD->getAAInfo());
-    SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
-                                 DAG.getConstant(NewImm, SDLoc(Value),
-                                                 NewVT));
-    SDValue NewST =
-        DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
-                     ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
-
-    AddToWorklist(NewPtr.getNode());
-    AddToWorklist(NewLD.getNode());
-    AddToWorklist(NewVal.getNode());
-    WorklistRemover DeadNodes(*this);
-    DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
-    ++OpsNarrowed;
-    return NewST;
+    if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
+                               LD->getAddressSpace(), NewAlign,
+                               LD->getMemOperand()->getFlags(), &IsFast) &&
+        IsFast)
+      break;
   }
+  // If the loop above did not find an accepted ShAmt, we need to exit here.
+ if (ShAmt + NewBW > VTStoreSize) + return SDValue(); + + APInt NewImm = Imm.lshr(ShAmt).trunc(NewBW); + if (Opc == ISD::AND) + NewImm ^= APInt::getAllOnes(NewBW); + Align NewAlign = commonAlignment(LD->getAlign(), PtrOff); + SDValue NewPtr = + DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD)); + SDValue NewLD = + DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr, + LD->getPointerInfo().getWithOffset(PtrOff), NewAlign, + LD->getMemOperand()->getFlags(), LD->getAAInfo()); + SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD, + DAG.getConstant(NewImm, SDLoc(Value), NewVT)); + SDValue NewST = + DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr, + ST->getPointerInfo().getWithOffset(PtrOff), NewAlign); + + AddToWorklist(NewPtr.getNode()); + AddToWorklist(NewLD.getNode()); + AddToWorklist(NewVal.getNode()); + WorklistRemover DeadNodes(*this); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1)); + ++OpsNarrowed; + return NewST; } return SDValue(); diff --git a/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll new file mode 100644 index 0000000000000..6819af3ba3e26 --- /dev/null +++ b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple armv7 -O1 | FileCheck %s -check-prefix=CHECK-LE-NORMAL +; RUN: llc < %s -mtriple armv7 -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-LE-NARROW +; RUN: llc < %s -mtriple armv7eb -O1 | FileCheck %s -check-prefix=CHECK-BE-NORMAL +; RUN: llc < %s -mtriple armv7eb -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-BE-NARROW + +; This is a reproducer for a bug when DAGCombiner::ReduceLoadOpStoreWidth +; would end up narrowing the load-op-store sequence into this SDNode sequence +; for little-endian +; +; t18: i32,ch = load<(load (s32) from %ir.p1 + 8, align 8)> t0, t17, undef:i32 +; t20: i32 = or t18, Constant:i32<65534> +; t21: ch = store<(store (s32) into %ir.p1 + 8, align 8)> t18:1, t20, t17, undef:i32 +; +; This was wrong since it accesses memory above %ir.p1+9 which is outside the +; "store size" for the original store. +; +; For big-endian we used to hit an assertion due to passing a negative offset +; to getMemBasePlusOffset (at least after commit 3e1b55cafc95d4ef4, while +; before that commit we got load/store instructions that accessed memory at a +; negative offset from %p1). 
+; +define i16 @test(ptr %p1) { +; CHECK-LE-NORMAL-LABEL: test: +; CHECK-LE-NORMAL: @ %bb.0: @ %entry +; CHECK-LE-NORMAL-NEXT: ldrh r1, [r0, #8] +; CHECK-LE-NORMAL-NEXT: movw r2, #65534 +; CHECK-LE-NORMAL-NEXT: orr r1, r1, r2 +; CHECK-LE-NORMAL-NEXT: strh r1, [r0, #8] +; CHECK-LE-NORMAL-NEXT: mov r0, #0 +; CHECK-LE-NORMAL-NEXT: bx lr +; +; CHECK-LE-NARROW-LABEL: test: +; CHECK-LE-NARROW: @ %bb.0: @ %entry +; CHECK-LE-NARROW-NEXT: ldr r1, [r0, #6] +; CHECK-LE-NARROW-NEXT: orr r1, r1, #16646144 +; CHECK-LE-NARROW-NEXT: orr r1, r1, #-16777216 +; CHECK-LE-NARROW-NEXT: str r1, [r0, #6] +; CHECK-LE-NARROW-NEXT: mov r0, #0 +; CHECK-LE-NARROW-NEXT: bx lr +; +; CHECK-BE-NORMAL-LABEL: test: +; CHECK-BE-NORMAL: @ %bb.0: @ %entry +; CHECK-BE-NORMAL-NEXT: ldrh r1, [r0] +; CHECK-BE-NORMAL-NEXT: movw r2, #65534 +; CHECK-BE-NORMAL-NEXT: orr r1, r1, r2 +; CHECK-BE-NORMAL-NEXT: strh r1, [r0] +; CHECK-BE-NORMAL-NEXT: mov r0, #0 +; CHECK-BE-NORMAL-NEXT: bx lr +; +; CHECK-BE-NARROW-LABEL: test: +; CHECK-BE-NARROW: @ %bb.0: @ %entry +; CHECK-BE-NARROW-NEXT: ldr r1, [r0] +; CHECK-BE-NARROW-NEXT: orr r1, r1, #16646144 +; CHECK-BE-NARROW-NEXT: orr r1, r1, #-16777216 +; CHECK-BE-NARROW-NEXT: str r1, [r0] +; CHECK-BE-NARROW-NEXT: mov r0, #0 +; CHECK-BE-NARROW-NEXT: bx lr +entry: + %load = load i80, ptr %p1, align 32 + %mask = shl i80 -1, 65 + %op = or i80 %load, %mask + store i80 %op, ptr %p1, align 32 + ret i16 0 +} + diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll index e51ba9d9bf039..995946c007dca 100644 --- a/llvm/test/CodeGen/X86/apx/or.ll +++ b/llvm/test/CodeGen/X86/apx/or.ll @@ -906,13 +906,13 @@ entry: define void @or64mi_legacy(ptr %a) { ; CHECK-LABEL: or64mi_legacy: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00] +; CHECK-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: or64mi_legacy: ; NF: # %bb.0: # %entry -; NF-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00] +; NF-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00] ; NF-NEXT: # imm = 0x1E240 ; NF-NEXT: retq # encoding: [0xc3] entry: diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll index 7fb07c6b3163e..338163d0e1cff 100644 --- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -6,22 +6,12 @@ define void @i24_or(ptr %a) { ; X86-LABEL: i24_or: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %ecx -; X86-NEXT: movzbl 2(%eax), %edx -; X86-NEXT: shll $16, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl $384, %edx # imm = 0x180 -; X86-NEXT: movw %dx, (%eax) +; X86-NEXT: orw $384, (%eax) # imm = 0x180 ; X86-NEXT: retl ; ; X64-LABEL: i24_or: ; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: movzbl 2(%rdi), %ecx -; X64-NEXT: shll $16, %ecx -; X64-NEXT: orl %eax, %ecx -; X64-NEXT: orl $384, %ecx # imm = 0x180 -; X64-NEXT: movw %cx, (%rdi) +; X64-NEXT: orw $384, (%rdi) # imm = 0x180 ; X64-NEXT: retq %aa = load i24, ptr %a, align 1 %b = or i24 %aa, 384 @@ -103,12 +93,12 @@ define void @i56_or(ptr %a) { ; X86-LABEL: i56_or: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl $384, (%eax) # imm = 0x180 +; X86-NEXT: orw $384, (%eax) # imm = 0x180 ; X86-NEXT: retl ; ; X64-LABEL: i56_or: ; X64: # %bb.0: -; X64-NEXT: orl $384, (%rdi) 
# imm = 0x180 +; X64-NEXT: orw $384, (%rdi) # imm = 0x180 ; X64-NEXT: retq %aa = load i56, ptr %a, align 1 %b = or i56 %aa, 384 diff --git a/llvm/test/CodeGen/X86/pr35763.ll b/llvm/test/CodeGen/X86/pr35763.ll index 9d2ee84cf675b..c10a24cffe5f2 100644 --- a/llvm/test/CodeGen/X86/pr35763.ll +++ b/llvm/test/CodeGen/X86/pr35763.ll @@ -14,15 +14,7 @@ define dso_local void @PR35763() { ; CHECK-NEXT: movzwl z+2(%rip), %ecx ; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: movq %rcx, tf_3_var_136(%rip) -; CHECK-NEXT: movl z+6(%rip), %eax -; CHECK-NEXT: movzbl z+10(%rip), %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rax, %rcx -; CHECK-NEXT: movabsq $1090921758719, %rax # imm = 0xFE0000FFFF -; CHECK-NEXT: andq %rcx, %rax -; CHECK-NEXT: movl %eax, z+6(%rip) -; CHECK-NEXT: shrq $32, %rax -; CHECK-NEXT: movb %al, z+10(%rip) +; CHECK-NEXT: andl $-33554177, z+7(%rip) # imm = 0xFE0000FF ; CHECK-NEXT: retq entry: %0 = load i16, ptr @z, align 8 diff --git a/llvm/test/CodeGen/X86/store_op_load_fold.ll b/llvm/test/CodeGen/X86/store_op_load_fold.ll index bd7068535eabc..0309fa7bb0310 100644 --- a/llvm/test/CodeGen/X86/store_op_load_fold.ll +++ b/llvm/test/CodeGen/X86/store_op_load_fold.ll @@ -23,7 +23,7 @@ define void @test2() nounwind uwtable ssp { ; CHECK-LABEL: test2: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl L_s2$non_lazy_ptr, %eax -; CHECK-NEXT: andl $-262144, 20(%eax) ## imm = 0xFFFC0000 +; CHECK-NEXT: andl $-67108609, 19(%eax) ## imm = 0xFC0000FF ; CHECK-NEXT: retl %bf.load35 = load i56, ptr getelementptr inbounds (%struct.S2, ptr @s2, i32 0, i32 5), align 16 %bf.clear36 = and i56 %bf.load35, -1125895611875329
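
Note (reviewer sketch, not part of the patch): the standalone C++ program below
illustrates the index arithmetic that the rewritten ReduceLoadOpStoreWidth
performs: byte-aligning the least/most significant modified bits, picking NewBW
as the next power of two, and then searching for a byte offset whose window
[ShAmt, ShAmt+NewBW) covers both LSB and MSB without reaching past the original
store size, including the big-endian pointer adjustment. It uses plain uint64_t
instead of APInt, GCC/Clang bit-count builtins, a local nextPowerOf2 helper
standing in for llvm::NextPowerOf2, and a made-up 64-bit immediate, and it
omits the legality, profitability, and alignment checks, so treat it as an
approximation of the patch rather than the patch itself.

// Standalone sketch of the narrowing index arithmetic (assumptions noted above).
#include <cassert>
#include <cstdint>
#include <cstdio>

// Smallest power of two strictly greater than A (mirrors llvm::NextPowerOf2).
static unsigned nextPowerOf2(unsigned A) {
  unsigned P = 1;
  while (P <= A)
    P <<= 1;
  return P;
}

int main() {
  const unsigned BitWidth = 64;         // store size of the original access
  const uint64_t Imm = 0xFFFEull << 16; // bits actually changed by the 'or'
  assert(Imm != 0 && "nothing to narrow");

  // Byte-align the least/most significant modified bits, as the patch does.
  const unsigned BitsPerByteMask = 7u;
  unsigned LSB = __builtin_ctzll(Imm) & ~BitsPerByteMask;
  unsigned MSB = (63u - __builtin_clzll(Imm)) | BitsPerByteMask;
  unsigned NewBW = nextPowerOf2(MSB - LSB); // candidate narrow width in bits

  // Search for a byte-aligned window [ShAmt, ShAmt+NewBW) that covers both
  // LSB and MSB while staying inside the original store size.
  for (unsigned ShAmt = 0; ShAmt + NewBW <= BitWidth; ShAmt += 8) {
    if (ShAmt > LSB)
      break;                              // window can no longer reach LSB
    if (ShAmt + NewBW <= MSB)
      continue;                           // window does not cover MSB yet
    unsigned PtrOffLE = ShAmt / 8;                      // little-endian offset
    unsigned PtrOffBE = (BitWidth - NewBW - ShAmt) / 8; // big-endian offset
    printf("NewBW=%u ShAmt=%u PtrOff(LE)=%u PtrOff(BE)=%u\n", NewBW, ShAmt,
           PtrOffLE, PtrOffBE);
    return 0;
  }
  printf("no in-bounds narrow access found\n");
  return 0;
}

For the i80 reproducer above, the same search settles on a 16-bit access at
byte offset 8 on little-endian (the strh in CHECK-LE-NORMAL), instead of the
32-bit access at offset 8 that the old code emitted, which read past the
10-byte store size.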