@@ -9398,6 +9398,35 @@ static std::optional<bool> isBigEndian(ArrayRef<int64_t> ByteOffsets,
93989398 return BigEndian;
93999399}
94009400
9401+ // Determines if multiple bytes loaded into a register
9402+ // corresponds to loading a single, contiguous block of bytes from memory and
9403+ // then perform a bitwise right rotation. Returns the rotation amount or
9404+ // std::nullopt if we can't match the pattern.
9405+ static std::optional<unsigned> getRotationAmount(ArrayRef<int64_t> ByteOffsets,
9406+ int64_t FirstOffset) {
9407+ unsigned ByteWidth = ByteOffsets.size();
9408+ if (ByteWidth == 0)
9409+ return std::nullopt;
9410+
9411+ int64_t FirstByteActualOffset = ByteOffsets[0];
9412+ int64_t RotateAmtInBytes = FirstByteActualOffset - FirstOffset;
9413+
9414+ // Check the rotation amount is valid
9415+ if (RotateAmtInBytes < 0 || RotateAmtInBytes >= ByteWidth)
9416+ return std::nullopt;
9417+
9418+ // Make sure each of the following loads follow the same rotational pattern.
9419+ for (unsigned I = 0; I < ByteWidth; ++I) {
9420+ int64_t ExpectedOffset = FirstOffset + ((I + RotateAmtInBytes) % ByteWidth);
9421+ if (ByteOffsets[I] != ExpectedOffset) {
9422+ return std::nullopt;
9423+ }
9424+ }
9425+
9426+ // Return the rotation amount in bits.
9427+ return RotateAmtInBytes * 8;
9428+ }
9429+
94019430// Look through one layer of truncate or extend.
94029431static SDValue stripTruncAndExt(SDValue Value) {
94039432 switch (Value.getOpcode()) {
@@ -9772,99 +9801,54 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
97729801 MemVT))
97739802 return SDValue();
97749803
9775- auto IsRotateLoaded = [](ArrayRef<int64_t> ByteOffsets, int64_t FirstOffset,
9776- unsigned BitWidth) {
9777- // Ensure that we have the correct width type, we want to combine two 32
9778- // loads into a 64 bit load.
9779- if (BitWidth != 64 || ByteOffsets.size() != 8)
9780- return false;
9781-
9782- constexpr unsigned FourBytes = 4;
9783-
9784- for (unsigned i = 0; i < FourBytes; ++i) {
9785- // Check the lower 4 bytes come from the higher memory address.
9786- if (ByteOffsets[i] != FirstOffset + i + FourBytes)
9787- return false;
9788- // Check the higher 4 bytes come from the lower memory adderess.
9789- if (ByteOffsets[i + FourBytes] != FirstOffset + i)
9790- return false;
9791- }
9792- return true;
9793- };
9794-
97959804 // Check if the bytes of the OR we are looking at match with either big or
97969805 // little endian value load
97979806 std::optional<bool> IsBigEndian = isBigEndian(
97989807 ArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset);
97999808
9800- bool IsRotated = false;
9801- if (!IsBigEndian) {
9802- IsRotated =
9803- IsRotateLoaded(ArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes),
9804- FirstOffset, VT.getSizeInBits());
9805-
9806- if (!IsRotated)
9809+ // Handle the standard load combine.
9810+ if (IsBigEndian) {
9811+ bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;
9812+
9813+ // Before legalize we can introduce illegal bswaps which will be later
9814+ // converted to an explicit bswap sequence. This way we end up with a single
9815+ // load and byte shuffling instead of several loads and byte shuffling.
9816+ // We do not introduce illegal bswaps when zero-extending as this tends to
9817+ // introduce too many arithmetic instructions.
9818+ if (NeedsBswap && (LegalOperations || NeedsZext) &&
9819+ !TLI.isOperationLegal(ISD::BSWAP, VT))
98079820 return SDValue();
9808- }
9809-
9810- assert(FirstByteProvider && "must be set");
9811-
9812- // Ensure that the first byte is loaded from zero offset of the first load.
9813- // So the combined value can be loaded from the first load address.
9814- if (MemoryByteOffset(*FirstByteProvider) != 0)
9815- return SDValue();
9816- auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value());
9817-
9818- // The node we are looking at matches with the pattern, check if we can
9819- // replace it with a single (possibly zero-extended) load and bswap + shift if
9820- // needed.
9821-
9822- // If the load needs byte swap check if the target supports it, make sure that
9823- // we are not rotating.
9824- bool NeedsBswap = !IsRotated && (IsBigEndianTarget != *IsBigEndian);
9825-
9826- // Before legalize we can introduce illegal bswaps which will be later
9827- // converted to an explicit bswap sequence. This way we end up with a single
9828- // load and byte shuffling instead of several loads and byte shuffling.
9829- // We do not introduce illegal bswaps when zero-extending as this tends to
9830- // introduce too many arithmetic instructions.
9831- if (NeedsBswap && (LegalOperations || NeedsZext) &&
9832- !TLI.isOperationLegal(ISD::BSWAP, VT))
9833- return SDValue();
98349821
9835- // If we need to rotate make sure that is legal.
9836- if (IsRotated && LegalOperations && !TLI.isOperationLegal(ISD::ROTR, VT))
9837- return SDValue();
9822+ // If we need to bswap and zero extend, we have to insert a shift. Check
9823+ // that it is legal.
9824+ if (NeedsBswap && NeedsZext && LegalOperations &&
9825+ !TLI.isOperationLegal(ISD::SHL, VT))
9826+ return SDValue();
98389827
9839- // If we need to bswap and zero extend, we have to insert a shift. Check
9840- // thatunsigned Fast = 0; it is legal.
9841- if (NeedsBswap && NeedsZext && LegalOperations &&
9842- !TLI.isOperationLegal(ISD::SHL, VT))
9843- return SDValue();
9828+ auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value());
9829+ if (MemoryByteOffset(*FirstByteProvider) != 0)
9830+ return SDValue();
98449831
9845- // Check that a load of the wide type is both allowed and fast on the target
9846- unsigned Fast = 0;
9847- bool Allowed =
9848- TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
9849- *FirstLoad->getMemOperand(), &Fast);
9850- if (!Allowed || !Fast)
9851- return SDValue();
9832+ // Check that a load of the wide type is both allowed and fast on the target
9833+ unsigned Fast = 0;
9834+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
9835+ *FirstLoad->getMemOperand(), &Fast) ||
9836+ !Fast)
9837+ return SDValue();
98529838
9853- SDValue NewLoad =
9854- DAG.getExtLoad( NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT,
9855- Chain , FirstLoad->getBasePtr() ,
9856- FirstLoad->getPointerInfo(), MemVT, FirstLoad->getAlign());
9839+ SDValue NewLoad = DAG.getExtLoad(
9840+ NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT, Chain ,
9841+ FirstLoad->getBasePtr() , FirstLoad->getPointerInfo(), MemVT ,
9842+ FirstLoad->getAlign());
98579843
9858- // Transfer chain users from old loads to the new load.
9859- for (LoadSDNode *L : Loads)
9860- DAG.makeEquivalentMemoryOrdering(L, NewLoad);
9844+ for (LoadSDNode *L : Loads)
9845+ DAG.makeEquivalentMemoryOrdering(L, NewLoad);
98619846
9862- // If no transform is needed then return the new load .
9863- if (!NeedsBswap && !IsRotated )
9864- return NewLoad;
9847+ // It is a simple combine.
9848+ if (!NeedsBswap)
9849+ return NewLoad;
98659850
9866- // If we detect the need to BSWAP build the new node and return it.
9867- if (NeedsBswap) {
9851+ // It is a BSWAP combine.
98689852 SDValue ShiftedLoad =
98699853 NeedsZext ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
98709854 DAG.getShiftAmountConstant(
@@ -9873,19 +9857,47 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
98739857 return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
98749858 }
98759859
9876- // If we detect we need to rotate build the new ROTR node.
9877- if (IsRotated) {
9878- // The amount to rotate is half that of the size, i.e 32 bits for an i64
9879- unsigned RotateAmount = VT.getSizeInBits() / 2;
9860+ // Handle the rotated load combine.
9861+ if (auto RotateAmt = getRotationAmount(
9862+ ArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset)) {
9863+
9864+ // Make sure we can rotate
9865+ if (LegalOperations && !TLI.isOperationLegal(ISD::ROTR, VT))
9866+ return SDValue();
9867+
9868+ auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value());
9869+ if (MemoryByteOffset(*FirstByteProvider) != 0)
9870+ return SDValue();
9871+
9872+ // Make sure the operation is legal and fast.
9873+ unsigned Fast = 0;
9874+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
9875+ *FirstLoad->getMemOperand(), &Fast) ||
9876+ !Fast)
9877+ return SDValue();
9878+
9879+ // Create the new load, rotate and then zero extend after if we need to.
9880+ SDValue NewLoad =
9881+ DAG.getLoad(MemVT, SDLoc(N), Chain, FirstLoad->getBasePtr(),
9882+ FirstLoad->getPointerInfo());
9883+
9884+ for (LoadSDNode *L : Loads)
9885+ DAG.makeEquivalentMemoryOrdering(L, NewLoad);
98809886
98819887 EVT ShiftAmountTy =
98829888 TLI.getShiftAmountTy(NewLoad.getValueType(), DAG.getDataLayout());
9889+ SDValue Rotated =
9890+ DAG.getNode(ISD::ROTR, SDLoc(N), MemVT, NewLoad,
9891+ DAG.getConstant(*RotateAmt, SDLoc(N), ShiftAmountTy));
98839892
9884- return DAG.getNode(ISD::ROTR, SDLoc(N), VT, NewLoad,
9885- DAG.getConstant(RotateAmount, SDLoc(N), ShiftAmountTy));
9893+ if (NeedsZext)
9894+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Rotated);
9895+
9896+ return Rotated;
98869897 }
98879898
9888- llvm_unreachable("Should have returned a transformed load value");
9899+ // No pattern matched.
9900+ return SDValue();
98899901}
98909902
98919903// If the target has andn, bsl, or a similar bit-select instruction,
0 commit comments