diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 605937503407a..907c4577d93d3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -138,6 +138,11 @@ static cl::opt<bool> EnableReduceLoadOpStoreWidth(
     "combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
     cl::desc("DAG combiner enable reducing the width of load/op/store "
              "sequence"));
+static cl::opt<bool> ReduceLoadOpStoreWidthForceNarrowingProfitable(
+    "combiner-reduce-load-op-store-width-force-narrowing-profitable",
+    cl::Hidden, cl::init(false),
+    cl::desc("DAG combiner force override the narrowing profitable check when "
+             "reducing the width of load/op/store sequences"));
 static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
     "combiner-shrink-load-replace-store-with-store", cl::Hidden,
     cl::init(true),
@@ -20351,19 +20356,38 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     // The narrowing should be profitable, the load/store operation should be
     // legal (or custom) and the store size should be equal to the NewVT width.
-    while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
-                                !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
-                                !TLI.isNarrowingProfitable(N, VT, NewVT))) {
+    while (NewBW < BitWidth &&
+           (NewVT.getStoreSizeInBits() != NewBW ||
+            !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+            (!ReduceLoadOpStoreWidthForceNarrowingProfitable &&
+             !TLI.isNarrowingProfitable(N, VT, NewVT)))) {
       NewBW = NextPowerOf2(NewBW);
       NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     }
     if (NewBW >= BitWidth)
       return SDValue();
 
-    // If the lsb changed does not start at the type bitwidth boundary,
-    // start at the previous one.
-    if (ShAmt % NewBW)
-      ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
+    // TODO: For big-endian we probably want to align given the most significant
+    // bit being modified instead of adjusting ShAmt based on least significant
+    // bits. This to reduce the risk of failing on the alignment check below. If
+    // for example VT.getStoreSize()==5 and Imm is 0x0000ffff00, then we want to
+    // find NewBW=16, and we want to load/store with a PtrOff set to 2. But then
+    // ShAmt should be set to 8, which isn't a multiple of NewBW. But given
+    // that isNarrowingProfitable doesn't seem to be overridden for any in-tree
+    // big-endian target, then the support for big-endian here isn't covered by
+    // any in-tree lit tests, so it is unfortunately not highly optimized
+    // either. It should be possible to improve that by using
+    // ReduceLoadOpStoreWidthForceNarrowingProfitable.
+
+    // If the lsb that is modified does not start at the type bitwidth boundary,
+    // align to start at the previous boundary.
+    ShAmt = ShAmt - (ShAmt % NewBW);
+
+    // Make sure we do not access memory outside the memory touched by the
+    // original load/store.
+    if (ShAmt + NewBW > VT.getStoreSizeInBits())
+      return SDValue();
+
     APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
                                    std::min(BitWidth, ShAmt + NewBW));
     if ((Imm & Mask) == Imm) {
diff --git a/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll
new file mode 100644
index 0000000000000..efdfa10f7ca07
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple armv7 -O1 | FileCheck %s -check-prefix=CHECK-LE-NORMAL
+; RUN: llc < %s -mtriple armv7 -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-LE-NARROW
+; RUN: llc < %s -mtriple armv7eb -O1 | FileCheck %s -check-prefix=CHECK-BE-NORMAL
+; RUN: llc < %s -mtriple armv7eb -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-BE-NARROW
+
+; This is a reproducer for a bug when DAGCombiner::ReduceLoadOpStoreWidth
+; would end up narrowing the load-op-store sequence into this SDNode sequence
+; for little-endian
+;
+;   t18: i32,ch = load<(load (s32) from %ir.p1 + 8, align 8)> t0, t17, undef:i32
+;   t20: i32 = or t18, Constant:i32<65534>
+;   t21: ch = store<(store (s32) into %ir.p1 + 8, align 8)> t18:1, t20, t17, undef:i32
+;
+; This was wrong since it accesses memory above %ir.p1+9 which is outside the
+; "store size" for the original store.
+;
+; For big-endian we used to hit an assertion due to passing a negative offset
+; to getMemBasePlusOffset (at least after commit 3e1b55cafc95d4ef4, while
+; before that commit we got load/store instructions that accessed memory at a
+; negative offset from %p1).
+;
+define i16 @test(ptr %p1) {
+; CHECK-LE-NORMAL-LABEL: test:
+; CHECK-LE-NORMAL:         @ %bb.0: @ %entry
+; CHECK-LE-NORMAL-NEXT:    ldrh r1, [r0, #8]
+; CHECK-LE-NORMAL-NEXT:    movw r2, #65534
+; CHECK-LE-NORMAL-NEXT:    orr r1, r1, r2
+; CHECK-LE-NORMAL-NEXT:    strh r1, [r0, #8]
+; CHECK-LE-NORMAL-NEXT:    mov r0, #0
+; CHECK-LE-NORMAL-NEXT:    bx lr
+;
+; CHECK-LE-NARROW-LABEL: test:
+; CHECK-LE-NARROW:         @ %bb.0: @ %entry
+; CHECK-LE-NARROW-NEXT:    ldrh r1, [r0, #8]
+; CHECK-LE-NARROW-NEXT:    movw r2, #65534
+; CHECK-LE-NARROW-NEXT:    orr r1, r1, r2
+; CHECK-LE-NARROW-NEXT:    strh r1, [r0, #8]
+; CHECK-LE-NARROW-NEXT:    mov r0, #0
+; CHECK-LE-NARROW-NEXT:    bx lr
+;
+; CHECK-BE-NORMAL-LABEL: test:
+; CHECK-BE-NORMAL:         @ %bb.0: @ %entry
+; CHECK-BE-NORMAL-NEXT:    ldrh r1, [r0]
+; CHECK-BE-NORMAL-NEXT:    movw r2, #65534
+; CHECK-BE-NORMAL-NEXT:    orr r1, r1, r2
+; CHECK-BE-NORMAL-NEXT:    strh r1, [r0]
+; CHECK-BE-NORMAL-NEXT:    mov r0, #0
+; CHECK-BE-NORMAL-NEXT:    bx lr
+;
+; CHECK-BE-NARROW-LABEL: test:
+; CHECK-BE-NARROW:         @ %bb.0: @ %entry
+; CHECK-BE-NARROW-NEXT:    ldrh r1, [r0]
+; CHECK-BE-NARROW-NEXT:    movw r2, #65534
+; CHECK-BE-NARROW-NEXT:    orr r1, r1, r2
+; CHECK-BE-NARROW-NEXT:    strh r1, [r0]
+; CHECK-BE-NARROW-NEXT:    mov r0, #0
+; CHECK-BE-NARROW-NEXT:    bx lr
+entry:
+  %load = load i80, ptr %p1, align 32
+  %mask = shl i80 -1, 65
+  %op = or i80 %load, %mask
+  store i80 %op, ptr %p1, align 32
+  ret i16 0
+}
+
diff --git a/llvm/test/CodeGen/X86/store_op_load_fold.ll b/llvm/test/CodeGen/X86/store_op_load_fold.ll
index bd7068535eabc..2915d1a7d7ae1 100644
--- a/llvm/test/CodeGen/X86/store_op_load_fold.ll
+++ b/llvm/test/CodeGen/X86/store_op_load_fold.ll
@@ -23,7 +23,10 @@ define void @test2() nounwind uwtable ssp {
 ; CHECK-LABEL: test2:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl L_s2$non_lazy_ptr, %eax
-; CHECK-NEXT:    andl $-262144, 20(%eax) ## imm = 0xFFFC0000
+; CHECK-NEXT:    movzbl 22(%eax), %ecx
+; CHECK-NEXT:    andl $-4, %ecx
+; CHECK-NEXT:    movb %cl, 22(%eax)
+; CHECK-NEXT:    movw $0, 20(%eax)
 ; CHECK-NEXT:    retl
   %bf.load35 = load i56, ptr getelementptr inbounds (%struct.S2, ptr @s2, i32 0, i32 5), align 16
   %bf.clear36 = and i56 %bf.load35, -1125895611875329
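
For context (not part of the patch itself): below is a minimal standalone sketch of the placement rule the new code enforces, using plain unsigned integers in place of LLVM's APInt/EVT machinery; the function and variable names are invented for illustration. The shift amount is rounded down to a NewBW boundary, and the combine is rejected when the narrowed access would reach past the store size of the original value. For the i80 store in the ARM test (store size 80 bits, lowest modified bit 65, NewBW 32) this gives ShAmt 64, and 64 + 32 > 80, so the out-of-bounds 32-bit access at byte offset 8 is no longer generated.

// Sketch only; not LLVM code. Assumes the simplified inputs described above.
#include <cassert>
#include <cstdint>
#include <optional>

// Returns the bit offset of the narrowed access, or std::nullopt if the
// narrowed load/store would touch memory outside the original value.
std::optional<uint64_t> placeNarrowedAccess(uint64_t ShAmt, uint64_t NewBW,
                                            uint64_t StoreSizeInBits) {
  // Align the lowest modified bit down to a NewBW boundary.
  ShAmt -= ShAmt % NewBW;
  // Reject the combine if the narrowed access would extend past the memory
  // touched by the original load/store.
  if (ShAmt + NewBW > StoreSizeInBits)
    return std::nullopt;
  return ShAmt;
}

int main() {
  // The i80 case from the ARM test: lowest modified bit 65, NewBW 32, store
  // size 80 bits. The old code produced a 32-bit access at bit offset 64
  // (byte offset 8), touching bytes 8..11 of a 10-byte value; now rejected.
  assert(!placeNarrowedAccess(65, 32, 80));
  // A case that still narrows: bits 16..23 modified in an i32 value.
  assert(placeNarrowedAccess(16, 8, 32) == 16);
  return 0;
}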