Skip to content

Commit 7d3b32c

Browse files
committed
[DAG] Implemented code review comments and added zext
1 parent fc3931d commit 7d3b32c

File tree

1 file changed

+99
-87
lines changed

1 file changed

+99
-87
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 99 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -9398,6 +9398,35 @@ static std::optional<bool> isBigEndian(ArrayRef<int64_t> ByteOffsets,
93989398
return BigEndian;
93999399
}
94009400

9401+
// Determines whether the bytes assembled into a register correspond to
9402+
// loading a single, contiguous block of bytes from memory and then
9403+
// performing a bitwise right rotation on the loaded value.
9404+
//
// With R = ByteOffsets[0] - FirstOffset and W = ByteOffsets.size(), the
// pattern matches when 0 <= R < W and every entry satisfies
// ByteOffsets[I] == FirstOffset + ((I + R) % W). Note that R == 0 (offsets
// already ascending from FirstOffset) is accepted and yields a rotation of 0.
//
// Returns the rotation amount in bits (R * 8), or std::nullopt if ByteOffsets
// is empty or the offsets do not follow the rotational pattern.
9405+
static std::optional<unsigned> getRotationAmount(ArrayRef<int64_t> ByteOffsets,
9406+
int64_t FirstOffset) {
9407+
unsigned ByteWidth = ByteOffsets.size();
9408+
if (ByteWidth == 0)
9409+
return std::nullopt;
9410+
9411+
// The distance of the first byte from FirstOffset is the candidate rotation
// amount, measured in bytes.
int64_t FirstByteActualOffset = ByteOffsets[0];
9412+
int64_t RotateAmtInBytes = FirstByteActualOffset - FirstOffset;
9413+
9414+
// Reject rotation amounts outside [0, ByteWidth). The negative check runs
// first, so the comparison against ByteWidth only sees non-negative values.
9415+
if (RotateAmtInBytes < 0 || RotateAmtInBytes >= ByteWidth)
9416+
return std::nullopt;
9417+
9418+
// Make sure every byte follows the same rotational pattern, wrapping around
// at ByteWidth. (I == 0 matches by construction of RotateAmtInBytes.)
9419+
for (unsigned I = 0; I < ByteWidth; ++I) {
9420+
int64_t ExpectedOffset = FirstOffset + ((I + RotateAmtInBytes) % ByteWidth);
9421+
if (ByteOffsets[I] != ExpectedOffset) {
9422+
return std::nullopt;
9423+
}
9424+
}
9425+
9426+
// Return the rotation amount in bits (8 bits per byte).
9427+
return RotateAmtInBytes * 8;
9428+
}
9429+
94019430
// Look through one layer of truncate or extend.
94029431
static SDValue stripTruncAndExt(SDValue Value) {
94039432
switch (Value.getOpcode()) {
@@ -9772,99 +9801,54 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
97729801
MemVT))
97739802
return SDValue();
97749803

9775-
auto IsRotateLoaded = [](ArrayRef<int64_t> ByteOffsets, int64_t FirstOffset,
9776-
unsigned BitWidth) {
9777-
// Ensure that we have the correct width type, we want to combine two 32
9778-
// loads into a 64 bit load.
9779-
if (BitWidth != 64 || ByteOffsets.size() != 8)
9780-
return false;
9781-
9782-
constexpr unsigned FourBytes = 4;
9783-
9784-
for (unsigned i = 0; i < FourBytes; ++i) {
9785-
// Check the lower 4 bytes come from the higher memory address.
9786-
if (ByteOffsets[i] != FirstOffset + i + FourBytes)
9787-
return false;
9788-
// Check the higher 4 bytes come from the lower memory address.
9789-
if (ByteOffsets[i + FourBytes] != FirstOffset + i)
9790-
return false;
9791-
}
9792-
return true;
9793-
};
9794-
97959804
// Check if the bytes of the OR we are looking at match with either big or
97969805
// little endian value load
97979806
std::optional<bool> IsBigEndian = isBigEndian(
97989807
ArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset);
97999808

9800-
bool IsRotated = false;
9801-
if (!IsBigEndian) {
9802-
IsRotated =
9803-
IsRotateLoaded(ArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes),
9804-
FirstOffset, VT.getSizeInBits());
9805-
9806-
if (!IsRotated)
9809+
// Handle the standard load combine.
9810+
if (IsBigEndian) {
9811+
bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;
9812+
9813+
// Before legalize we can introduce illegal bswaps which will be later
9814+
// converted to an explicit bswap sequence. This way we end up with a single
9815+
// load and byte shuffling instead of several loads and byte shuffling.
9816+
// We do not introduce illegal bswaps when zero-extending as this tends to
9817+
// introduce too many arithmetic instructions.
9818+
if (NeedsBswap && (LegalOperations || NeedsZext) &&
9819+
!TLI.isOperationLegal(ISD::BSWAP, VT))
98079820
return SDValue();
9808-
}
9809-
9810-
assert(FirstByteProvider && "must be set");
9811-
9812-
// Ensure that the first byte is loaded from zero offset of the first load.
9813-
// So the combined value can be loaded from the first load address.
9814-
if (MemoryByteOffset(*FirstByteProvider) != 0)
9815-
return SDValue();
9816-
auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value());
9817-
9818-
// The node we are looking at matches with the pattern, check if we can
9819-
// replace it with a single (possibly zero-extended) load and bswap + shift if
9820-
// needed.
9821-
9822-
// If the load needs byte swap check if the target supports it, make sure that
9823-
// we are not rotating.
9824-
bool NeedsBswap = !IsRotated && (IsBigEndianTarget != *IsBigEndian);
9825-
9826-
// Before legalize we can introduce illegal bswaps which will be later
9827-
// converted to an explicit bswap sequence. This way we end up with a single
9828-
// load and byte shuffling instead of several loads and byte shuffling.
9829-
// We do not introduce illegal bswaps when zero-extending as this tends to
9830-
// introduce too many arithmetic instructions.
9831-
if (NeedsBswap && (LegalOperations || NeedsZext) &&
9832-
!TLI.isOperationLegal(ISD::BSWAP, VT))
9833-
return SDValue();
98349821

9835-
// If we need to rotate make sure that is legal.
9836-
if (IsRotated && LegalOperations && !TLI.isOperationLegal(ISD::ROTR, VT))
9837-
return SDValue();
9822+
// If we need to bswap and zero extend, we have to insert a shift. Check
9823+
// that it is legal.
9824+
if (NeedsBswap && NeedsZext && LegalOperations &&
9825+
!TLI.isOperationLegal(ISD::SHL, VT))
9826+
return SDValue();
98389827

9839-
// If we need to bswap and zero extend, we have to insert a shift. Check
9840-
// that it is legal.
9841-
if (NeedsBswap && NeedsZext && LegalOperations &&
9842-
!TLI.isOperationLegal(ISD::SHL, VT))
9843-
return SDValue();
9828+
auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value());
9829+
if (MemoryByteOffset(*FirstByteProvider) != 0)
9830+
return SDValue();
98449831

9845-
// Check that a load of the wide type is both allowed and fast on the target
9846-
unsigned Fast = 0;
9847-
bool Allowed =
9848-
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
9849-
*FirstLoad->getMemOperand(), &Fast);
9850-
if (!Allowed || !Fast)
9851-
return SDValue();
9832+
// Check that a load of the wide type is both allowed and fast on the target
9833+
unsigned Fast = 0;
9834+
if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
9835+
*FirstLoad->getMemOperand(), &Fast) ||
9836+
!Fast)
9837+
return SDValue();
98529838

9853-
SDValue NewLoad =
9854-
DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT,
9855-
Chain, FirstLoad->getBasePtr(),
9856-
FirstLoad->getPointerInfo(), MemVT, FirstLoad->getAlign());
9839+
SDValue NewLoad = DAG.getExtLoad(
9840+
NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT, Chain,
9841+
FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), MemVT,
9842+
FirstLoad->getAlign());
98579843

9858-
// Transfer chain users from old loads to the new load.
9859-
for (LoadSDNode *L : Loads)
9860-
DAG.makeEquivalentMemoryOrdering(L, NewLoad);
9844+
for (LoadSDNode *L : Loads)
9845+
DAG.makeEquivalentMemoryOrdering(L, NewLoad);
98619846

9862-
// If no transform is needed then return the new load.
9863-
if (!NeedsBswap && !IsRotated)
9864-
return NewLoad;
9847+
// It is a simple combine.
9848+
if (!NeedsBswap)
9849+
return NewLoad;
98659850

9866-
// If we detect the need to BSWAP build the new node and return it.
9867-
if (NeedsBswap) {
9851+
// It is a BSWAP combine.
98689852
SDValue ShiftedLoad =
98699853
NeedsZext ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
98709854
DAG.getShiftAmountConstant(
@@ -9873,19 +9857,47 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
98739857
return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
98749858
}
98759859

9876-
// If we detect we need to rotate build the new ROTR node.
9877-
if (IsRotated) {
9878-
// The amount to rotate is half that of the size, i.e 32 bits for an i64
9879-
unsigned RotateAmount = VT.getSizeInBits() / 2;
9860+
// Handle the rotated load combine.
9861+
if (auto RotateAmt = getRotationAmount(
9862+
ArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset)) {
9863+
9864+
// Make sure we can rotate
9865+
if (LegalOperations && !TLI.isOperationLegal(ISD::ROTR, VT))
9866+
return SDValue();
9867+
9868+
auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value());
9869+
if (MemoryByteOffset(*FirstByteProvider) != 0)
9870+
return SDValue();
9871+
9872+
// Make sure the operation is legal and fast.
9873+
unsigned Fast = 0;
9874+
if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
9875+
*FirstLoad->getMemOperand(), &Fast) ||
9876+
!Fast)
9877+
return SDValue();
9878+
9879+
// Create the new load, rotate and then zero extend after if we need to.
9880+
SDValue NewLoad =
9881+
DAG.getLoad(MemVT, SDLoc(N), Chain, FirstLoad->getBasePtr(),
9882+
FirstLoad->getPointerInfo());
9883+
9884+
for (LoadSDNode *L : Loads)
9885+
DAG.makeEquivalentMemoryOrdering(L, NewLoad);
98809886

98819887
EVT ShiftAmountTy =
98829888
TLI.getShiftAmountTy(NewLoad.getValueType(), DAG.getDataLayout());
9889+
SDValue Rotated =
9890+
DAG.getNode(ISD::ROTR, SDLoc(N), MemVT, NewLoad,
9891+
DAG.getConstant(*RotateAmt, SDLoc(N), ShiftAmountTy));
98839892

9884-
return DAG.getNode(ISD::ROTR, SDLoc(N), VT, NewLoad,
9885-
DAG.getConstant(RotateAmount, SDLoc(N), ShiftAmountTy));
9893+
if (NeedsZext)
9894+
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Rotated);
9895+
9896+
return Rotated;
98869897
}
98879898

9888-
llvm_unreachable("Should have returned a transformed load value");
9899+
// No pattern matched.
9900+
return SDValue();
98899901
}
98909902

98919903
// If the target has andn, bsl, or a similar bit-select instruction,

0 commit comments

Comments
 (0)