diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 907c4577d93d3..9d888d9940a1c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -20342,17 +20342,21 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { ST->getPointerInfo().getAddrSpace()) return SDValue(); - // Find the type to narrow it the load / op / store to. + // Find the type NewVT to narrow the load / op / store to. SDValue N1 = Value.getOperand(1); unsigned BitWidth = N1.getValueSizeInBits(); APInt Imm = N1->getAsAPIntVal(); if (Opc == ISD::AND) - Imm ^= APInt::getAllOnes(BitWidth); + Imm.flipAllBits(); if (Imm == 0 || Imm.isAllOnes()) return SDValue(); - unsigned ShAmt = Imm.countr_zero(); - unsigned MSB = BitWidth - Imm.countl_zero() - 1; - unsigned NewBW = NextPowerOf2(MSB - ShAmt); + // Find least/most significant bit that need to be part of the narrowed + // operation. We assume target will need to address/access full bytes, so + // we make sure to align LSB and MSB at byte boundaries. + unsigned BitsPerByteMask = 7u; + unsigned LSB = Imm.countr_zero() & ~BitsPerByteMask; + unsigned MSB = (Imm.getActiveBits() - 1) | BitsPerByteMask; + unsigned NewBW = NextPowerOf2(MSB - LSB); EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW); // The narrowing should be profitable, the load/store operation should be // legal (or custom) and the store size should be equal to the NewVT width. @@ -20367,68 +20371,69 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { if (NewBW >= BitWidth) return SDValue(); - // TODO: For big-endian we probably want to align given the most significant - // bit being modified instead of adjusting ShAmt based on least significant - // bits. This to reduce the risk of failing on the alignment check below. If - // for example VT.getStoreSize()==5 and Imm is 0x0000ffff00, then we want to - // find NewBW=16, and we want to load/store with a PtrOff set to 2. But then - // ShAmt should be set to 8, which isn't a multiple of NewBW. But given - // that isNarrowingProfitable doesn't seem to be overridden for any in-tree - // big-endian target, then the support for big-endian here isn't covered by - // any in-tree lit tests, so it is unfortunately not highly optimized - // either. It should be possible to improve that by using - // ReduceLoadOpStoreWidthForceNarrowingProfitable. - - // If the lsb that is modified does not start at the type bitwidth boundary, - // align to start at the previous boundary. - ShAmt = ShAmt - (ShAmt % NewBW); - - // Make sure we do not access memory outside the memory touched by the - // original load/store. - if (ShAmt + NewBW > VT.getStoreSizeInBits()) - return SDValue(); + // If we come this far NewVT/NewBW reflect a power-of-2 sized type that is + // large enough to cover all bits that should be modified. This type might + // however be larger than really needed (such as i32 while we actually only + // need to modify one byte). Now we need to find our how to align the memory + // accesses to satisfy preferred alignments as well as avoiding to access + // memory outside the store size of the orignal access. + + unsigned VTStoreSize = VT.getStoreSizeInBits().getFixedValue(); + + // Let ShAmt denote amount of bits to skip, counted from the least + // significant bits of Imm. And let PtrOff how much the pointer needs to be + // offsetted (in bytes) for the new access. + unsigned ShAmt = 0; + uint64_t PtrOff = 0; + for (; ShAmt + NewBW <= VTStoreSize; ShAmt += 8) { + // Make sure the range [ShAmt, ShAmt+NewBW) cover both LSB and MSB. + if (ShAmt > LSB) + return SDValue(); + if (ShAmt + NewBW < MSB) + continue; - APInt Mask = APInt::getBitsSet(BitWidth, ShAmt, - std::min(BitWidth, ShAmt + NewBW)); - if ((Imm & Mask) == Imm) { - APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW); - if (Opc == ISD::AND) - NewImm ^= APInt::getAllOnes(NewBW); - uint64_t PtrOff = ShAmt / 8; - // For big endian targets, we need to adjust the offset to the pointer to - // load the correct bytes. - if (DAG.getDataLayout().isBigEndian()) - PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff; + // Calculate PtrOff. + unsigned PtrAdjustmentInBits = DAG.getDataLayout().isBigEndian() + ? VTStoreSize - NewBW - ShAmt + : ShAmt; + PtrOff = PtrAdjustmentInBits / 8; + // Now check if narrow access is allowed and fast, considering alignments. unsigned IsFast = 0; Align NewAlign = commonAlignment(LD->getAlign(), PtrOff); - if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT, - LD->getAddressSpace(), NewAlign, - LD->getMemOperand()->getFlags(), &IsFast) || - !IsFast) - return SDValue(); - - SDValue NewPtr = - DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD)); - SDValue NewLD = - DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr, - LD->getPointerInfo().getWithOffset(PtrOff), NewAlign, - LD->getMemOperand()->getFlags(), LD->getAAInfo()); - SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD, - DAG.getConstant(NewImm, SDLoc(Value), - NewVT)); - SDValue NewST = - DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr, - ST->getPointerInfo().getWithOffset(PtrOff), NewAlign); - - AddToWorklist(NewPtr.getNode()); - AddToWorklist(NewLD.getNode()); - AddToWorklist(NewVal.getNode()); - WorklistRemover DeadNodes(*this); - DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1)); - ++OpsNarrowed; - return NewST; + if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT, + LD->getAddressSpace(), NewAlign, + LD->getMemOperand()->getFlags(), &IsFast) && + IsFast) + break; } + // If loop above did not find any accepted ShAmt we need to exit here. + if (ShAmt + NewBW > VTStoreSize) + return SDValue(); + + APInt NewImm = Imm.lshr(ShAmt).trunc(NewBW); + if (Opc == ISD::AND) + NewImm.flipAllBits(); + Align NewAlign = commonAlignment(LD->getAlign(), PtrOff); + SDValue NewPtr = + DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD)); + SDValue NewLD = + DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr, + LD->getPointerInfo().getWithOffset(PtrOff), NewAlign, + LD->getMemOperand()->getFlags(), LD->getAAInfo()); + SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD, + DAG.getConstant(NewImm, SDLoc(Value), NewVT)); + SDValue NewST = + DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr, + ST->getPointerInfo().getWithOffset(PtrOff), NewAlign); + + AddToWorklist(NewPtr.getNode()); + AddToWorklist(NewLD.getNode()); + AddToWorklist(NewVal.getNode()); + WorklistRemover DeadNodes(*this); + DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1)); + ++OpsNarrowed; + return NewST; } return SDValue(); diff --git a/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll index efdfa10f7ca07..6819af3ba3e26 100644 --- a/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll +++ b/llvm/test/CodeGen/ARM/dagcombine-ld-op-st.ll @@ -32,10 +32,10 @@ define i16 @test(ptr %p1) { ; ; CHECK-LE-NARROW-LABEL: test: ; CHECK-LE-NARROW: @ %bb.0: @ %entry -; CHECK-LE-NARROW-NEXT: ldrh r1, [r0, #8] -; CHECK-LE-NARROW-NEXT: movw r2, #65534 -; CHECK-LE-NARROW-NEXT: orr r1, r1, r2 -; CHECK-LE-NARROW-NEXT: strh r1, [r0, #8] +; CHECK-LE-NARROW-NEXT: ldr r1, [r0, #6] +; CHECK-LE-NARROW-NEXT: orr r1, r1, #16646144 +; CHECK-LE-NARROW-NEXT: orr r1, r1, #-16777216 +; CHECK-LE-NARROW-NEXT: str r1, [r0, #6] ; CHECK-LE-NARROW-NEXT: mov r0, #0 ; CHECK-LE-NARROW-NEXT: bx lr ; @@ -50,10 +50,10 @@ define i16 @test(ptr %p1) { ; ; CHECK-BE-NARROW-LABEL: test: ; CHECK-BE-NARROW: @ %bb.0: @ %entry -; CHECK-BE-NARROW-NEXT: ldrh r1, [r0] -; CHECK-BE-NARROW-NEXT: movw r2, #65534 -; CHECK-BE-NARROW-NEXT: orr r1, r1, r2 -; CHECK-BE-NARROW-NEXT: strh r1, [r0] +; CHECK-BE-NARROW-NEXT: ldr r1, [r0] +; CHECK-BE-NARROW-NEXT: orr r1, r1, #16646144 +; CHECK-BE-NARROW-NEXT: orr r1, r1, #-16777216 +; CHECK-BE-NARROW-NEXT: str r1, [r0] ; CHECK-BE-NARROW-NEXT: mov r0, #0 ; CHECK-BE-NARROW-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll index e51ba9d9bf039..995946c007dca 100644 --- a/llvm/test/CodeGen/X86/apx/or.ll +++ b/llvm/test/CodeGen/X86/apx/or.ll @@ -906,13 +906,13 @@ entry: define void @or64mi_legacy(ptr %a) { ; CHECK-LABEL: or64mi_legacy: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00] +; CHECK-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: or64mi_legacy: ; NF: # %bb.0: # %entry -; NF-NEXT: orq $123456, (%rdi) # encoding: [0x48,0x81,0x0f,0x40,0xe2,0x01,0x00] +; NF-NEXT: orl $123456, (%rdi) # encoding: [0x81,0x0f,0x40,0xe2,0x01,0x00] ; NF-NEXT: # imm = 0x1E240 ; NF-NEXT: retq # encoding: [0xc3] entry: diff --git a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll index 7fb07c6b3163e..338163d0e1cff 100644 --- a/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll +++ b/llvm/test/CodeGen/X86/illegal-bitfield-loadstore.ll @@ -6,22 +6,12 @@ define void @i24_or(ptr %a) { ; X86-LABEL: i24_or: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl (%eax), %ecx -; X86-NEXT: movzbl 2(%eax), %edx -; X86-NEXT: shll $16, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl $384, %edx # imm = 0x180 -; X86-NEXT: movw %dx, (%eax) +; X86-NEXT: orw $384, (%eax) # imm = 0x180 ; X86-NEXT: retl ; ; X64-LABEL: i24_or: ; X64: # %bb.0: -; X64-NEXT: movzwl (%rdi), %eax -; X64-NEXT: movzbl 2(%rdi), %ecx -; X64-NEXT: shll $16, %ecx -; X64-NEXT: orl %eax, %ecx -; X64-NEXT: orl $384, %ecx # imm = 0x180 -; X64-NEXT: movw %cx, (%rdi) +; X64-NEXT: orw $384, (%rdi) # imm = 0x180 ; X64-NEXT: retq %aa = load i24, ptr %a, align 1 %b = or i24 %aa, 384 @@ -103,12 +93,12 @@ define void @i56_or(ptr %a) { ; X86-LABEL: i56_or: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl $384, (%eax) # imm = 0x180 +; X86-NEXT: orw $384, (%eax) # imm = 0x180 ; X86-NEXT: retl ; ; X64-LABEL: i56_or: ; X64: # %bb.0: -; X64-NEXT: orl $384, (%rdi) # imm = 0x180 +; X64-NEXT: orw $384, (%rdi) # imm = 0x180 ; X64-NEXT: retq %aa = load i56, ptr %a, align 1 %b = or i56 %aa, 384 diff --git a/llvm/test/CodeGen/X86/pr35763.ll b/llvm/test/CodeGen/X86/pr35763.ll index 9d2ee84cf675b..c10a24cffe5f2 100644 --- a/llvm/test/CodeGen/X86/pr35763.ll +++ b/llvm/test/CodeGen/X86/pr35763.ll @@ -14,15 +14,7 @@ define dso_local void @PR35763() { ; CHECK-NEXT: movzwl z+2(%rip), %ecx ; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: movq %rcx, tf_3_var_136(%rip) -; CHECK-NEXT: movl z+6(%rip), %eax -; CHECK-NEXT: movzbl z+10(%rip), %ecx -; CHECK-NEXT: shlq $32, %rcx -; CHECK-NEXT: orq %rax, %rcx -; CHECK-NEXT: movabsq $1090921758719, %rax # imm = 0xFE0000FFFF -; CHECK-NEXT: andq %rcx, %rax -; CHECK-NEXT: movl %eax, z+6(%rip) -; CHECK-NEXT: shrq $32, %rax -; CHECK-NEXT: movb %al, z+10(%rip) +; CHECK-NEXT: andl $-33554177, z+7(%rip) # imm = 0xFE0000FF ; CHECK-NEXT: retq entry: %0 = load i16, ptr @z, align 8 diff --git a/llvm/test/CodeGen/X86/store_op_load_fold.ll b/llvm/test/CodeGen/X86/store_op_load_fold.ll index 2915d1a7d7ae1..0309fa7bb0310 100644 --- a/llvm/test/CodeGen/X86/store_op_load_fold.ll +++ b/llvm/test/CodeGen/X86/store_op_load_fold.ll @@ -23,10 +23,7 @@ define void @test2() nounwind uwtable ssp { ; CHECK-LABEL: test2: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl L_s2$non_lazy_ptr, %eax -; CHECK-NEXT: movzbl 22(%eax), %ecx -; CHECK-NEXT: andl $-4, %ecx -; CHECK-NEXT: movb %cl, 22(%eax) -; CHECK-NEXT: movw $0, 20(%eax) +; CHECK-NEXT: andl $-67108609, 19(%eax) ## imm = 0xFC0000FF ; CHECK-NEXT: retl %bf.load35 = load i56, ptr getelementptr inbounds (%struct.S2, ptr @s2, i32 0, i32 5), align 16 %bf.clear36 = and i56 %bf.load35, -1125895611875329