159 changes: 111 additions & 48 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9398,6 +9398,35 @@ static std::optional<bool> isBigEndian(ArrayRef<int64_t> ByteOffsets,
return BigEndian;
}

// Determines whether the bytes loaded into a register correspond to loading a
// single, contiguous block of bytes from memory followed by a bitwise right
// rotation. Returns the rotation amount in bits, or std::nullopt if the
// pattern can't be matched.
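// Illustrative example (values chosen for exposition): on a little-endian
// target, an eight-byte register whose bytes come from memory offsets
// ByteOffsets = {4, 5, 6, 7, 0, 1, 2, 3} with FirstOffset = 0 gives
// RotateAmtInBytes = ByteOffsets[0] - FirstOffset = 4, every byte I satisfies
// ByteOffsets[I] == (I + 4) % 8, and the helper returns 32: the value is the
// contiguous eight-byte load rotated right by 32 bits.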
static std::optional<unsigned> getRotationAmount(ArrayRef<int64_t> ByteOffsets,
int64_t FirstOffset) {
unsigned ByteWidth = ByteOffsets.size();
if (ByteWidth == 0)
return std::nullopt;

int64_t FirstByteActualOffset = ByteOffsets[0];
int64_t RotateAmtInBytes = FirstByteActualOffset - FirstOffset;

// Check the rotation amount is valid
if (RotateAmtInBytes < 0 || RotateAmtInBytes >= ByteWidth)
return std::nullopt;

  // Make sure every byte offset follows the same rotational pattern.
for (unsigned I = 0; I < ByteWidth; ++I) {
int64_t ExpectedOffset = FirstOffset + ((I + RotateAmtInBytes) % ByteWidth);
if (ByteOffsets[I] != ExpectedOffset) {
return std::nullopt;
}
}

// Return the rotation amount in bits.
return RotateAmtInBytes * 8;
}

// Look through one layer of truncate or extend.
static SDValue stripTruncAndExt(SDValue Value) {
switch (Value.getOpcode()) {
@@ -9776,65 +9805,99 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
// little endian value load
std::optional<bool> IsBigEndian = isBigEndian(
ArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset);
if (!IsBigEndian)
return SDValue();

assert(FirstByteProvider && "must be set");
// Handle the standard load combine.
if (IsBigEndian) {
bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;

// Before legalize we can introduce illegal bswaps which will be later
// converted to an explicit bswap sequence. This way we end up with a single
// load and byte shuffling instead of several loads and byte shuffling.
// We do not introduce illegal bswaps when zero-extending as this tends to
// introduce too many arithmetic instructions.
if (NeedsBswap && (LegalOperations || NeedsZext) &&
!TLI.isOperationLegal(ISD::BSWAP, VT))
return SDValue();

// Ensure that the first byte is loaded from zero offset of the first load.
// So the combined value can be loaded from the first load address.
if (MemoryByteOffset(*FirstByteProvider) != 0)
return SDValue();
auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value());
    // If we need to bswap and zero extend, we have to insert a shift. Check
    // that it is legal.
if (NeedsBswap && NeedsZext && LegalOperations &&
!TLI.isOperationLegal(ISD::SHL, VT))
return SDValue();

// The node we are looking at matches with the pattern, check if we can
// replace it with a single (possibly zero-extended) load and bswap + shift if
// needed.
auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value());
if (MemoryByteOffset(*FirstByteProvider) != 0)
return SDValue();

// If the load needs byte swap check if the target supports it
bool NeedsBswap = IsBigEndianTarget != *IsBigEndian;
// Check that a load of the wide type is both allowed and fast on the target
unsigned Fast = 0;
if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
*FirstLoad->getMemOperand(), &Fast) ||
!Fast)
return SDValue();

// Before legalize we can introduce illegal bswaps which will be later
// converted to an explicit bswap sequence. This way we end up with a single
// load and byte shuffling instead of several loads and byte shuffling.
// We do not introduce illegal bswaps when zero-extending as this tends to
// introduce too many arithmetic instructions.
if (NeedsBswap && (LegalOperations || NeedsZext) &&
!TLI.isOperationLegal(ISD::BSWAP, VT))
return SDValue();
SDValue NewLoad = DAG.getExtLoad(
NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT, Chain,
FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(), MemVT,
FirstLoad->getAlign());

// If we need to bswap and zero extend, we have to insert a shift. Check that
// it is legal.
if (NeedsBswap && NeedsZext && LegalOperations &&
!TLI.isOperationLegal(ISD::SHL, VT))
return SDValue();
for (LoadSDNode *L : Loads)
DAG.makeEquivalentMemoryOrdering(L, NewLoad);

// Check that a load of the wide type is both allowed and fast on the target
unsigned Fast = 0;
bool Allowed =
TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
*FirstLoad->getMemOperand(), &Fast);
if (!Allowed || !Fast)
return SDValue();
// It is a simple combine.
if (!NeedsBswap)
return NewLoad;

SDValue NewLoad =
DAG.getExtLoad(NeedsZext ? ISD::ZEXTLOAD : ISD::NON_EXTLOAD, SDLoc(N), VT,
Chain, FirstLoad->getBasePtr(),
FirstLoad->getPointerInfo(), MemVT, FirstLoad->getAlign());
// It is a BSWAP combine.
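    // Illustrative example (widths assumed): zero-extending a byte-swapped
    // 16-bit load into an i32 (ZeroExtendedBytes == 2) becomes
    //   (bswap (shl (zextload i16), 16))
    // which leaves the swapped bytes in the low half and zeros above.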
SDValue ShiftedLoad =
NeedsZext ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
DAG.getShiftAmountConstant(
ZeroExtendedBytes * 8, VT, SDLoc(N)))
: NewLoad;
return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
}

// Transfer chain users from old loads to the new load.
for (LoadSDNode *L : Loads)
DAG.makeEquivalentMemoryOrdering(L, NewLoad);
// Handle the rotated load combine.
if (auto RotateAmt = getRotationAmount(
ArrayRef(ByteOffsets).drop_back(ZeroExtendedBytes), FirstOffset)) {

if (!NeedsBswap)
return NewLoad;
    // Make sure we can legally rotate the loaded value.
    if (LegalOperations && !TLI.isOperationLegal(ISD::ROTR, MemVT))
return SDValue();

SDValue ShiftedLoad =
NeedsZext ? DAG.getNode(ISD::SHL, SDLoc(N), VT, NewLoad,
DAG.getShiftAmountConstant(ZeroExtendedBytes * 8,
VT, SDLoc(N)))
: NewLoad;
return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, ShiftedLoad);
auto *FirstLoad = cast<LoadSDNode>(FirstByteProvider->Src.value());
if (MemoryByteOffset(*FirstByteProvider) != 0)
return SDValue();

// Make sure the operation is legal and fast.
unsigned Fast = 0;
if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
*FirstLoad->getMemOperand(), &Fast) ||
!Fast)
return SDValue();

    // Create the new load, rotate it, and zero extend the result if needed.
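    // Illustrative example (assuming MemVT == i64 and *RotateAmt == 32): this
    // emits (rotr (load i64 p), 32) and, when VT is wider than MemVT, wraps
    // the rotated value in a zero_extend.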
    SDValue NewLoad =
        DAG.getLoad(MemVT, SDLoc(N), Chain, FirstLoad->getBasePtr(),
                    FirstLoad->getPointerInfo(), FirstLoad->getAlign());

for (LoadSDNode *L : Loads)
DAG.makeEquivalentMemoryOrdering(L, NewLoad);

EVT ShiftAmountTy =
TLI.getShiftAmountTy(NewLoad.getValueType(), DAG.getDataLayout());
SDValue Rotated =
DAG.getNode(ISD::ROTR, SDLoc(N), MemVT, NewLoad,
DAG.getConstant(*RotateAmt, SDLoc(N), ShiftAmountTy));

if (NeedsZext)
return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Rotated);

return Rotated;
}

// No pattern matched.
return SDValue();
}

// If the target has andn, bsl, or a similar bit-select instruction,
63 changes: 63 additions & 0 deletions llvm/test/CodeGen/X86/load-combine.ll
@@ -1314,3 +1314,66 @@ define i32 @pr80911_vector_load_multiuse(ptr %ptr, ptr %clobber) nounwind {
%res = or i32 %e1.ext.shift, %e0.ext
ret i32 %res
}

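; The i64 value is assembled from two i32 loads with their halves swapped, so
; on a 64-bit target it should fold to a single 8-byte load plus a rotate by
; 32; the 32-bit target simply returns the two halves in edx:eax.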
define i64 @test_load_bswap_to_rotate(ptr %p) {
; CHECK-LABEL: test_load_bswap_to_rotate:
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl (%eax), %edx
; CHECK-NEXT: movl 4(%eax), %eax
; CHECK-NEXT: retl
;
; CHECK64-LABEL: test_load_bswap_to_rotate:
; CHECK64: # %bb.0:
; CHECK64-NEXT: movq (%rdi), %rax
; CHECK64-NEXT: rorq $32, %rax
; CHECK64-NEXT: retq

%p.hi = getelementptr inbounds nuw i8, ptr %p, i64 4
%lo = load i32, ptr %p
%hi = load i32, ptr %p.hi
%conv = zext i32 %lo to i64
%shl = shl nuw i64 %conv, 32
%conv2 = zext i32 %hi to i64
%or = or disjoint i64 %shl, %conv2
ret i64 %or
}

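; Bytes 1..3 of %p land in bits 0..23 and byte 0 lands in bits 24..31, i.e. a
; 4-byte load rotated right by 8 bits, then zero-extended to i64.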
define i64 @test_load_rotate_zext(ptr %p) {
; CHECK-LABEL: test_load_rotate_zext:
; CHECK: # %bb.0:
; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax
; CHECK-NEXT: movl (%eax), %eax
; CHECK-NEXT: rorl $8, %eax
; CHECK-NEXT: xorl %edx, %edx
; CHECK-NEXT: retl
;
; CHECK64-LABEL: test_load_rotate_zext:
; CHECK64: # %bb.0:
; CHECK64-NEXT: movl (%rdi), %eax
; CHECK64-NEXT: rorl $8, %eax
; CHECK64-NEXT: retq
%p1 = getelementptr inbounds i8, ptr %p, i64 1
%l1 = load i8, ptr %p1, align 1
%e1 = zext i8 %l1 to i64

%p2 = getelementptr inbounds i8, ptr %p, i64 2
%l2 = load i8, ptr %p2, align 1
%e2 = zext i8 %l2 to i64
%s2 = shl i64 %e2, 8

%p3 = getelementptr inbounds i8, ptr %p, i64 3
%l3 = load i8, ptr %p3, align 1
%e3 = zext i8 %l3 to i64
%s3 = shl i64 %e3, 16

%p0 = getelementptr inbounds i8, ptr %p, i64 0
%l0 = load i8, ptr %p0, align 1
%e0 = zext i8 %l0 to i64
%s0 = shl i64 %e0, 24

%or1 = or i64 %e1, %s2
%or2 = or i64 %or1, %s3
%or3 = or i64 %or2, %s0
ret i64 %or3
}