@@ -20342,17 +20342,21 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
2034220342 ST->getPointerInfo().getAddrSpace())
2034320343 return SDValue();
2034420344
20345- // Find the type to narrow it the load / op / store to.
20345+ // Find the type NewVT to narrow the load / op / store to.
2034620346 SDValue N1 = Value.getOperand(1);
2034720347 unsigned BitWidth = N1.getValueSizeInBits();
2034820348 APInt Imm = N1->getAsAPIntVal();
2034920349 if (Opc == ISD::AND)
20350- Imm ^= APInt::getAllOnes(BitWidth );
20350+ Imm.flipAllBits( );
2035120351 if (Imm == 0 || Imm.isAllOnes())
2035220352 return SDValue();
20353- unsigned ShAmt = Imm.countr_zero();
20354- unsigned MSB = BitWidth - Imm.countl_zero() - 1;
20355- unsigned NewBW = NextPowerOf2(MSB - ShAmt);
20353+ // Find the least/most significant bits that need to be part of the
20354+ // narrowed operation. We assume the target will need to address/access
20355+ // full bytes, so we make sure to align LSB and MSB at byte boundaries.
20356+ unsigned BitsPerByteMask = 7u;
20357+ unsigned LSB = Imm.countr_zero() & ~BitsPerByteMask;
20358+ unsigned MSB = (Imm.getActiveBits() - 1) | BitsPerByteMask;
20359+ unsigned NewBW = NextPowerOf2(MSB - LSB);
2035620360 EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
2035720361 // The narrowing should be profitable, the load/store operation should be
2035820362 // legal (or custom) and the store size should be equal to the NewVT width.
@@ -20367,68 +20371,69 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
2036720371 if (NewBW >= BitWidth)
2036820372 return SDValue();
2036920373
20370- // TODO: For big-endian we probably want to align given the most significant
20371- // bit being modified instead of adjusting ShAmt based on least significant
20372- // bits. This to reduce the risk of failing on the alignment check below. If
20373- // for example VT.getStoreSize()==5 and Imm is 0x0000ffff00, then we want to
20374- // find NewBW=16, and we want to load/store with a PtrOff set to 2. But then
20375- // ShAmt should be set to 8, which isn't a multiple of NewBW. But given
20376- // that isNarrowingProfitable doesn't seem to be overridden for any in-tree
20377- // big-endian target, then the support for big-endian here isn't covered by
20378- // any in-tree lit tests, so it is unfortunately not highly optimized
20379- // either. It should be possible to improve that by using
20380- // ReduceLoadOpStoreWidthForceNarrowingProfitable.
20381-
20382- // If the lsb that is modified does not start at the type bitwidth boundary,
20383- // align to start at the previous boundary.
20384- ShAmt = ShAmt - ( ShAmt % NewBW);
20385-
20386- // Make sure we do not access memory outside the memory touched by the
20387- // original load/store.
20388- if (ShAmt + NewBW > VT.getStoreSizeInBits() )
20389- return SDValue() ;
20374+ // If we get this far, NewVT/NewBW reflect a power-of-2 sized type that is
20375+ // large enough to cover all bits that should be modified. This type might
20376+ // however be larger than really needed (such as i32 while we actually only
20377+ // need to modify one byte). Now we need to find out how to align the memory
20378+ // accesses to satisfy preferred alignments as well as avoid accessing
20379+ // memory outside the store size of the original access.
20380+
20381+ unsigned VTStoreSize = VT.getStoreSizeInBits().getFixedValue();
20382+
20383+ // Let ShAmt denote the number of bits to skip, counted from the least
20384+ // significant bits of Imm. And let PtrOff denote how much the pointer needs
20385+ // to be offset (in bytes) for the new access.
20386+ unsigned ShAmt = 0;
20387+ uint64_t PtrOff = 0;
20388+ for (; ShAmt + NewBW <= VTStoreSize; ShAmt += 8) {
20389+ // Make sure the range [ShAmt, ShAmt+NewBW) covers both LSB and MSB.
20390+ if (ShAmt > LSB)
20391+ return SDValue();
20392+ if (ShAmt + NewBW < MSB )
20393+ continue ;
2039020394
20391- APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
20392- std::min(BitWidth, ShAmt + NewBW));
20393- if ((Imm & Mask) == Imm) {
20394- APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
20395- if (Opc == ISD::AND)
20396- NewImm ^= APInt::getAllOnes(NewBW);
20397- uint64_t PtrOff = ShAmt / 8;
20398- // For big endian targets, we need to adjust the offset to the pointer to
20399- // load the correct bytes.
20400- if (DAG.getDataLayout().isBigEndian())
20401- PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
20395+ // Calculate PtrOff.
20396+ unsigned PtrAdjustmentInBits = DAG.getDataLayout().isBigEndian()
20397+ ? VTStoreSize - NewBW - ShAmt
20398+ : ShAmt;
20399+ PtrOff = PtrAdjustmentInBits / 8;
2040220400
20401+ // Now check if narrow access is allowed and fast, considering alignments.
2040320402 unsigned IsFast = 0;
2040420403 Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
20405- if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
20406- LD->getAddressSpace(), NewAlign,
20407- LD->getMemOperand()->getFlags(), &IsFast) ||
20408- !IsFast)
20409- return SDValue();
20410-
20411- SDValue NewPtr =
20412- DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
20413- SDValue NewLD =
20414- DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
20415- LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
20416- LD->getMemOperand()->getFlags(), LD->getAAInfo());
20417- SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
20418- DAG.getConstant(NewImm, SDLoc(Value),
20419- NewVT));
20420- SDValue NewST =
20421- DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
20422- ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
20423-
20424- AddToWorklist(NewPtr.getNode());
20425- AddToWorklist(NewLD.getNode());
20426- AddToWorklist(NewVal.getNode());
20427- WorklistRemover DeadNodes(*this);
20428- DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
20429- ++OpsNarrowed;
20430- return NewST;
20404+ if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), NewVT,
20405+ LD->getAddressSpace(), NewAlign,
20406+ LD->getMemOperand()->getFlags(), &IsFast) &&
20407+ IsFast)
20408+ break;
2043120409 }
20410+ // If the loop above did not find an accepted ShAmt, we need to exit here.
20411+ if (ShAmt + NewBW > VTStoreSize)
20412+ return SDValue();
20413+
20414+ APInt NewImm = Imm.lshr(ShAmt).trunc(NewBW);
20415+ if (Opc == ISD::AND)
20416+ NewImm.flipAllBits();
20417+ Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
20418+ SDValue NewPtr =
20419+ DAG.getMemBasePlusOffset(Ptr, TypeSize::getFixed(PtrOff), SDLoc(LD));
20420+ SDValue NewLD =
20421+ DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
20422+ LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
20423+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
20424+ SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
20425+ DAG.getConstant(NewImm, SDLoc(Value), NewVT));
20426+ SDValue NewST =
20427+ DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
20428+ ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
20429+
20430+ AddToWorklist(NewPtr.getNode());
20431+ AddToWorklist(NewLD.getNode());
20432+ AddToWorklist(NewVal.getNode());
20433+ WorklistRemover DeadNodes(*this);
20434+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
20435+ ++OpsNarrowed;
20436+ return NewST;
2043220437 }
2043320438
2043420439 return SDValue();
0 commit comments