[DAGCombiner] Fix to avoid writing outside original store in ReduceLoadOpStoreWidth

bjope · bjope · commit 3a13d83e12d5 · 2024-12-08T19:28:10.000+01:00
DAGCombiner::ReduceLoadOpStoreWidth could replace memory accesses
with narrower loads/stores, although sometimes the new load/store
would touch memory outside the original object. That seemed wrong,
and this patch simply avoids doing the DAG combine in such
situations.

We might want to follow up with a patch that tries to align the memory
accesses differently (if allowed given the alignment setting), to
still do the transform in more situations. The current strategy for
deciding size and offset for the narrowed operations is a bit ad-hoc,
and especially for big-endian it seems to be poorly tuned in case a
target is sensitive to load/store alignments.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -20379,6 +20379,12 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     // If the lsb that is modified does not start at the type bitwidth boundary,
     // align to start at the previous boundary.
     ShAmt = ShAmt - (ShAmt % NewBW);
+
+    // Make sure we do not access memory outside the memory touched by the
+    // original load/store.
+    if (ShAmt + NewBW > VT.getStoreSizeInBits())
+      return SDValue();
+
     APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
                                    std::min(BitWidth, ShAmt + NewBW));
     if ((Imm & Mask) == Imm) {
diff --git a/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll
@@ -2,7 +2,7 @@
 ; RUN: llc < %s -mtriple armv7 -O1 | FileCheck %s -check-prefix=CHECK-LE-NORMAL
 ; RUN: llc < %s -mtriple armv7 -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-LE-NARROW
 ; RUN: llc < %s -mtriple armv7eb -O1 | FileCheck %s -check-prefix=CHECK-BE-NORMAL
-; XXXRUNXXX: llc < %s -mtriple armv7eb -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-BE-NARROW
+; RUN: llc < %s -mtriple armv7eb -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-BE-NARROW
 
 ; This is a reproducer for a bug when DAGCombiner::ReduceLoadOpStoreWidth
 ; would end up narrowing the load-op-store sequence into this SDNode sequence
@@ -12,12 +12,12 @@
 ;   t20: i32 = or t18, Constant:i32<65534>
 ;   t21: ch = store<(store (s32) into %ir.p1 + 8, align 8)> t18:1, t20, t17, undef:i32
 ;
-; This is wrong since it accesses memory above %ir.p1+9 which is outside the
+; This was wrong since it accesses memory above %ir.p1+9 which is outside the
 ; "store size" for the original store.
 ;
-; For big-endian we hit an assertion due to passing a negative offset to
-; getMemBasePlusOffset (at least after commit 3e1b55cafc95d4ef4, while before
-; that commit we got load/store instructions that accessed memory at a
+; For big-endian we used to hit an assertion due to passing a negative offset
+; to getMemBasePlusOffset (at least after commit 3e1b55cafc95d4ef4, while
+; before that commit we got load/store instructions that accessed memory at a
 ; negative offset from %p1).
 ;
 define i16 @test(ptr %p1) {
@@ -32,10 +32,10 @@ define i16 @test(ptr %p1) {
 ;
 ; CHECK-LE-NARROW-LABEL: test:
 ; CHECK-LE-NARROW:       @ %bb.0: @ %entry
-; CHECK-LE-NARROW-NEXT:    ldr r1, [r0, #8]
+; CHECK-LE-NARROW-NEXT:    ldrh r1, [r0, #8]
 ; CHECK-LE-NARROW-NEXT:    movw r2, #65534
 ; CHECK-LE-NARROW-NEXT:    orr r1, r1, r2
-; CHECK-LE-NARROW-NEXT:    str r1, [r0, #8]
+; CHECK-LE-NARROW-NEXT:    strh r1, [r0, #8]
 ; CHECK-LE-NARROW-NEXT:    mov r0, #0
 ; CHECK-LE-NARROW-NEXT:    bx lr
 ;
@@ -47,6 +47,15 @@ define i16 @test(ptr %p1) {
 ; CHECK-BE-NORMAL-NEXT:    strh r1, [r0]
 ; CHECK-BE-NORMAL-NEXT:    mov r0, #0
 ; CHECK-BE-NORMAL-NEXT:    bx lr
+;
+; CHECK-BE-NARROW-LABEL: test:
+; CHECK-BE-NARROW:       @ %bb.0: @ %entry
+; CHECK-BE-NARROW-NEXT:    ldrh r1, [r0]
+; CHECK-BE-NARROW-NEXT:    movw r2, #65534
+; CHECK-BE-NARROW-NEXT:    orr r1, r1, r2
+; CHECK-BE-NARROW-NEXT:    strh r1, [r0]
+; CHECK-BE-NARROW-NEXT:    mov r0, #0
+; CHECK-BE-NARROW-NEXT:    bx lr
 entry:
   %load = load i80, ptr %p1, align 32
   %mask = shl i80 -1, 65
diff --git a/llvm/test/CodeGen/X86/store_op_load_fold.ll b/llvm/test/CodeGen/X86/store_op_load_fold.ll
@@ -23,7 +23,10 @@ define void @test2() nounwind uwtable ssp {
 ; CHECK-LABEL: test2:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl L_s2$non_lazy_ptr, %eax
-; CHECK-NEXT:    andl $-262144, 20(%eax) ## imm = 0xFFFC0000
+; CHECK-NEXT:    movzbl 22(%eax), %ecx
+; CHECK-NEXT:    andl $-4, %ecx
+; CHECK-NEXT:    movb %cl, 22(%eax)
+; CHECK-NEXT:    movw $0, 20(%eax)
 ; CHECK-NEXT:    retl
   %bf.load35 = load i56, ptr getelementptr inbounds (%struct.S2, ptr @s2, i32 0, i32 5), align 16
   %bf.clear36 = and i56 %bf.load35, -1125895611875329