diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1aab0fce4e1e5..f14f6903232d1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -138,6 +138,11 @@ static cl::opt<bool> EnableReduceLoadOpStoreWidth(
     "combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
     cl::desc("DAG combiner enable reducing the width of load/op/store "
              "sequence"));
+static cl::opt<bool> ReduceLoadOpStoreWidthForceNarrowingProfitable(
+    "combiner-reduce-load-op-store-width-force-narrowing-profitable",
+    cl::Hidden, cl::init(false),
+    cl::desc("DAG combiner force override the narrowing profitable check when "
+             "reducing the width of load/op/store sequences"));
 static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
     "combiner-shrink-load-replace-store-with-store", cl::Hidden,
     cl::init(true),
@@ -20334,7 +20339,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
       ST->getPointerInfo().getAddrSpace())
     return SDValue();
 
-  // Find the type to narrow it the load / op / store to.
+  // Find the type NewVT to narrow the load / op / store to.
   SDValue N1 = Value.getOperand(1);
   unsigned BitWidth = N1.getValueSizeInBits();
   APInt Imm = N1->getAsAPIntVal();
@@ -20342,66 +20347,90 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
     Imm ^= APInt::getAllOnes(BitWidth);
   if (Imm == 0 || Imm.isAllOnes())
     return SDValue();
-  unsigned ShAmt = Imm.countr_zero();
-  unsigned MSB = BitWidth - Imm.countl_zero() - 1;
-  unsigned NewBW = NextPowerOf2(MSB - ShAmt);
+  // Find the least/most significant bits that need to be part of the
+  // narrowed operation. We assume the target will need to address/access
+  // full bytes, so we make sure to align LSB and MSB at byte boundaries.
+  unsigned BitsPerByteMask = 7u;
+  unsigned LSB = Imm.countr_zero() & ~BitsPerByteMask;
+  unsigned MSB = (Imm.getActiveBits() - 1) | BitsPerByteMask;
+  unsigned NewBW = NextPowerOf2(MSB - LSB);
   EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
   // The narrowing should be profitable, the load/store operation should be
   // legal (or custom) and the store size should be equal to the NewVT width.
-  while (NewBW < BitWidth && (NewVT.getStoreSizeInBits() != NewBW ||
-                              !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
-                              !TLI.isNarrowingProfitable(N, VT, NewVT))) {
+  while (NewBW < BitWidth &&
+         (NewVT.getStoreSizeInBits() != NewBW ||
+          !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+          !(TLI.isNarrowingProfitable(N, VT, NewVT) ||
+            ReduceLoadOpStoreWidthForceNarrowingProfitable))) {
     NewBW = NextPowerOf2(NewBW);
     NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
   }
   if (NewBW >= BitWidth)
     return SDValue();
 
-  // If the lsb changed does not start at the type bitwidth boundary,
-  // start at the previous one.
-  if (ShAmt % NewBW)
-    ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
-  APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
-                                 std::min(BitWidth, ShAmt + NewBW));
-  if ((Imm & Mask) == Imm) {
-    APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
-    if (Opc == ISD::AND)
-      NewImm ^= APInt::getAllOnes(NewBW);
-    uint64_t PtrOff = ShAmt / 8;
-    // For big endian targets, we need to adjust the offset to the pointer to
-    // load the correct bytes.
-    if (DAG.getDataLayout().isBigEndian())
-      PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
+  // If we get this far, NewVT/NewBW reflect a power-of-2 sized type that is
+  // large enough to cover all bits that should be modified. This type might,
+  // however, be larger than actually needed (such as i32 while we only need
+  // to modify one byte). Now we need to find out how to align the memory
+  // accesses to satisfy preferred alignments as well as avoid accessing
+  // memory outside the store size of the original access.
+
+  unsigned VTStoreSize = VT.getStoreSizeInBits().getFixedValue();
+
+  // Let ShAmt denote the number of bits to skip, counted from the least
+  // significant bits of Imm, and let PtrOff denote how much the pointer
+  // needs to be offset (in bytes) for the new access.
+  unsigned ShAmt = 0;
+  uint64_t PtrOff = 0;
+  for (; ShAmt + NewBW <= VTStoreSize; ShAmt += 8) {
+    // Make sure the range [ShAmt, ShAmt+NewBW) covers both LSB and MSB.
+    if (ShAmt > LSB)
+      return SDValue();
+    if (ShAmt + NewBW < MSB)
+      continue;
+
+    // Calculate PtrOff.
+    unsigned PtrAdjustmentInBits = DAG.getDataLayout().isBigEndian()
+                                       ? VTStoreSize - NewBW - ShAmt
+                                       : ShAmt;
+    PtrOff = PtrAdjustmentInBits / 8;
+    // Now check if narrow access is allowed and fast, considering alignments.
     unsigned IsFast = 0;
     Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
-    if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
-                                LD->getAddressSpace(), NewAlign,
-                                LD->getMemOperand()->getFlags(), &IsFast) ||
-        !IsFast)
-      return SDValue();
-
-    SDValue NewPtr =
-        DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
-    SDValue NewLD =
-        DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
-                    LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
-                    LD->getMemOperand()->getFlags(), LD->getAAInfo());
-    SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
-                                 DAG.getConstant(NewImm, SDLoc(Value),
-                                                 NewVT));
-    SDValue NewST =
-        DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
-                     ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
-
-    AddToWorklist(NewPtr.getNode());
-    AddToWorklist(NewLD.getNode());
-    AddToWorklist(NewVal.getNode());
-    WorklistRemover DeadNodes(*this);
-    DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
-    ++OpsNarrowed;
-    return NewST;
+    if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
+                               LD->getAddressSpace(), NewAlign,
+                               LD->getMemOperand()->getFlags(), &IsFast) &&
+        IsFast)
+      break;
   }
+  // If the loop above did not find an accepted ShAmt, we need to exit here.
+ if (ShAmt + NewBW > VTStoreSize) + return SDValue(); + + APInt NewImm = Imm.lshr(ShAmt).trunc(NewBW); + if (Opc == ISD::AND) + NewImm ^= APInt::getAllOnes(NewBW); + Align NewAlign = commonAlignment(LD->getAlign(), PtrOff); + SDValue NewPtr = + DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD)); + SDValue NewLD = + DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr, + LD->getPointerInfo().getWithOffset(PtrOff), NewAlign, + LD->getMemOperand()->getFlags(), LD->getAAInfo()); + SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD, + DAG.getConstant(NewImm, SDLoc(Value), NewVT)); + SDValue NewST = + DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr, + ST->getPointerInfo().getWithOffset(PtrOff), NewAlign); + + AddToWorklist(NewPtr.getNode()); + AddToWorklist(NewLD.getNode()); + AddToWorklist(NewVal.getNode()); + WorklistRemover DeadNodes(*this); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1)); + ++OpsNarrowed; + return NewST; } return SDValue(); diff --git a/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll new file mode 100644 index 0000000000000..6819af3ba3e26 --- /dev/null +++ b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple armv7 -O1 | FileCheck %s -check-prefix=CHECK-LE-NORMAL +; RUN: llc < %s -mtriple armv7 -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-LE-NARROW +; RUN: llc < %s -mtriple armv7eb -O1 | FileCheck %s -check-prefix=CHECK-BE-NORMAL +; RUN: llc < %s -mtriple armv7eb -O1 -combiner-reduce-load-op-store-width-force-narrowing-profitable=1 | FileCheck %s -check-prefix=CHECK-BE-NARROW + +; This is a reproducer for a bug when DAGCombiner::ReduceLoadOpStoreWidth +; would end up narrowing the load-op-store sequence into this SDNode sequence +; for little-endian +; +; t18: i32,ch = load<(load (s32) from %ir.p1 + 8, align 8)> t0, t17, undef:i32 +; t20: i32 = or t18, Constant:i32<65534> +; t21: ch = store<(store (s32) into %ir.p1 + 8, align 8)> t18:1, t20, t17, undef:i32 +; +; This was wrong since it accesses memory above %ir.p1+9 which is outside the +; "store size" for the original store. +; +; For big-endian we used to hit an assertion due to passing a negative offset +; to getMemBasePlusOffset (at least after commit 3e1b55cafc95d4ef4, while +; before that commit we got load/store instructions that accessed memory at a +; negative offset from %p1). 
+; +define i16 @test(ptr %p1) { +; CHECK-LE-NORMAL-LABEL: test: +; CHECK-LE-NORMAL: @ %bb.0: @ %entry +; CHECK-LE-NORMAL-NEXT: ldrh r1, [r0, #8] +; CHECK-LE-NORMAL-NEXT: movw r2, #65534 +; CHECK-LE-NORMAL-NEXT: orr r1, r1, r2 +; CHECK-LE-NORMAL-NEXT: strh r1, [r0, #8] +; CHECK-LE-NORMAL-NEXT: mov r0, #0 +; CHECK-LE-NORMAL-NEXT: bx lr +; +; CHECK-LE-NARROW-LABEL: test: +; CHECK-LE-NARROW: @ %bb.0: @ %entry +; CHECK-LE-NARROW-NEXT: ldr r1, [r0, #6] +; CHECK-LE-NARROW-NEXT: orr r1, r1, #16646144 +; CHECK-LE-NARROW-NEXT: orr r1, r1, #-16777216 +; CHECK-LE-NARROW-NEXT: str r1, [r0, #6] +; CHECK-LE-NARROW-NEXT: mov r0, #0 +; CHECK-LE-NARROW-NEXT: bx lr +; +; CHECK-BE-NORMAL-LABEL: test: +; CHECK-BE-NORMAL: @ %bb.0: @ %entry +; CHECK-BE-NORMAL-NEXT: ldrh r1, [r0] +; CHECK-BE-NORMAL-NEXT: movw r2, #65534 +; CHECK-BE-NORMAL-NEXT: orr r1, r1, r2 +; CHECK-BE-NORMAL-NEXT: strh r1, [r0] +; CHECK-BE-NORMAL-NEXT: mov r0, #0 +; CHECK-BE-NORMAL-NEXT: bx lr +; +; CHECK-BE-NARROW-LABEL: test: +; CHECK-BE-NARROW: @ %bb.0: @ %entry +; CHECK-BE-NARROW-NEXT: ldr r1, [r0] +; CHECK-BE-NARROW-NEXT: orr r1, r1, #16646144 +; CHECK-BE-NARROW-NEXT: orr r1, r1, #-16777216 +; CHECK-BE-NARROW-NEXT: str r1, [r0] +; CHECK-BE-NARROW-NEXT: mov r0, #0 +; CHECK-BE-NARROW-NEXT: bx lr +entry: + %load = load i80, ptr %p1, align 32 + %mask = shl i80 -1, 65 + %op = or i80 %load, %mask + store i80 %op, ptr %p1, align 32 + ret i16 0 +} + diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll index e51ba9d9bf039..995946c007dca 100644 --- a/llvm/test/CodeGen/X86/apx/or.ll +++ b/llvm/test/CodeGen/X86/apx/or.ll @@ -906,13 +906,13 @@ entry: define void @or64mi_legacy(ptr %a) { ; CHECK-LABEL: or64mi_legacy: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00] +; CHECK-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: or64mi_legacy: ; NF: # %bb.0: # %entry -; NF-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00] +; NF-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00] ; NF-NEXT: # imm = 0x1E240 ; NF-NEXT: retq # encoding: [0xc3] entry: diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll index 7fb07c6b3163e..338163d0e1cff 100644 --- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -6,22 +6,12 @@ define void @i24_or(ptr %a) { ; X86-LABEL: i24_or: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %ecx -; X86-NEXT: movzbl 2(%eax), %edx -; X86-NEXT: shll $16, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl $384, %edx # imm = 0x180 -; X86-NEXT: movw %dx, (%eax) +; X86-NEXT: orw $384, (%eax) # imm = 0x180 ; X86-NEXT: retl ; ; X64-LABEL: i24_or: ; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: movzbl 2(%rdi), %ecx -; X64-NEXT: shll $16, %ecx -; X64-NEXT: orl %eax, %ecx -; X64-NEXT: orl $384, %ecx # imm = 0x180 -; X64-NEXT: movw %cx, (%rdi) +; X64-NEXT: orw $384, (%rdi) # imm = 0x180 ; X64-NEXT: retq %aa = load i24, ptr %a, align 1 %b = or i24 %aa, 384 @@ -103,12 +93,12 @@ define void @i56_or(ptr %a) { ; X86-LABEL: i56_or: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl $384, (%eax) # imm = 0x180 +; X86-NEXT: orw $384, (%eax) # imm = 0x180 ; X86-NEXT: retl ; ; X64-LABEL: i56_or: ; X64: # %bb.0: -; X64-NEXT: orl $384, (%rdi) 
# imm = 0x180 +; X64-NEXT: orw $384, (%rdi) # imm = 0x180 ; X64-NEXT: retq %aa = load i56, ptr %a, align 1 %b = or i56 %aa, 384 diff --git a/llvm/test/CodeGen/X86/pr35763.ll b/llvm/test/CodeGen/X86/pr35763.ll index 9d2ee84cf675b..c10a24cffe5f2 100644 --- a/llvm/test/CodeGen/X86/pr35763.ll +++ b/llvm/test/CodeGen/X86/pr35763.ll @@ -14,15 +14,7 @@ define dso_local void @PR35763() { ; CHECK-NEXT: movzwl z+2(%rip), %ecx ; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: movq %rcx, tf_3_var_136(%rip) -; CHECK-NEXT: movl z+6(%rip), %eax -; CHECK-NEXT: movzbl z+10(%rip), %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rax, %rcx -; CHECK-NEXT: movabsq $1090921758719, %rax # imm = 0xFE0000FFFF -; CHECK-NEXT: andq %rcx, %rax -; CHECK-NEXT: movl %eax, z+6(%rip) -; CHECK-NEXT: shrq $32, %rax -; CHECK-NEXT: movb %al, z+10(%rip) +; CHECK-NEXT: andl $-33554177, z+7(%rip) # imm = 0xFE0000FF ; CHECK-NEXT: retq entry: %0 = load i16, ptr @z, align 8 diff --git a/llvm/test/CodeGen/X86/store_op_load_fold.ll b/llvm/test/CodeGen/X86/store_op_load_fold.ll index bd7068535eabc..0309fa7bb0310 100644 --- a/llvm/test/CodeGen/X86/store_op_load_fold.ll +++ b/llvm/test/CodeGen/X86/store_op_load_fold.ll @@ -23,7 +23,7 @@ define void @test2() nounwind uwtable ssp { ; CHECK-LABEL: test2: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl L_s2$non_lazy_ptr, %eax -; CHECK-NEXT: andl $-262144, 20(%eax) ## imm = 0xFFFC0000 +; CHECK-NEXT: andl $-67108609, 19(%eax) ## imm = 0xFC0000FF ; CHECK-NEXT: retl %bf.load35 = load i56, ptr getelementptr inbounds (%struct.S2, ptr @s2, i32 0, i32 5), align 16 %bf.clear36 = and i56 %bf.load35, -1125895611875329
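
Note (reviewer sketch, not part of the patch): the standalone C++ program below
illustrates the index arithmetic that the rewritten ReduceLoadOpStoreWidth
performs: byte-aligning the least/most significant modified bits, picking NewBW
as the next power of two, and then searching for a byte offset whose window
[ShAmt, ShAmt+NewBW) covers both LSB and MSB without reaching past the original
store size, including the big-endian pointer adjustment. It uses plain uint64_t
instead of APInt, GCC/Clang bit-count builtins, a local nextPowerOf2 helper
standing in for llvm::NextPowerOf2, and a made-up 64-bit immediate, and it
omits the legality, profitability, and alignment checks, so treat it as an
approximation of the patch rather than the patch itself.

// Standalone sketch of the narrowing index arithmetic (assumptions noted above).
#include <cassert>
#include <cstdint>
#include <cstdio>

// Smallest power of two strictly greater than A (mirrors llvm::NextPowerOf2).
static unsigned nextPowerOf2(unsigned A) {
  unsigned P = 1;
  while (P <= A)
    P <<= 1;
  return P;
}

int main() {
  const unsigned BitWidth = 64;         // store size of the original access
  const uint64_t Imm = 0xFFFEull << 16; // bits actually changed by the 'or'
  assert(Imm != 0 && "nothing to narrow");

  // Byte-align the least/most significant modified bits, as the patch does.
  const unsigned BitsPerByteMask = 7u;
  unsigned LSB = __builtin_ctzll(Imm) & ~BitsPerByteMask;
  unsigned MSB = (63u - __builtin_clzll(Imm)) | BitsPerByteMask;
  unsigned NewBW = nextPowerOf2(MSB - LSB); // candidate narrow width in bits

  // Search for a byte-aligned window [ShAmt, ShAmt+NewBW) that covers both
  // LSB and MSB while staying inside the original store size.
  for (unsigned ShAmt = 0; ShAmt + NewBW <= BitWidth; ShAmt += 8) {
    if (ShAmt > LSB)
      break;                              // window can no longer reach LSB
    if (ShAmt + NewBW <= MSB)
      continue;                           // window does not cover MSB yet
    unsigned PtrOffLE = ShAmt / 8;                      // little-endian offset
    unsigned PtrOffBE = (BitWidth - NewBW - ShAmt) / 8; // big-endian offset
    printf("NewBW=%u ShAmt=%u PtrOff(LE)=%u PtrOff(BE)=%u\n", NewBW, ShAmt,
           PtrOffLE, PtrOffBE);
    return 0;
  }
  printf("no in-bounds narrow access found\n");
  return 0;
}

For the i80 reproducer above, the same search settles on a 16-bit access at
byte offset 8 on little-endian (the strh in CHECK-LE-NORMAL), instead of the
32-bit access at offset 8 that the old code emitted, which read past the
10-byte store size.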