25 changes: 24 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26665,11 +26665,34 @@ static SDValue performDUPCombine(SDNode *N,
}

if (N->getOpcode() == AArch64ISD::DUP) {
SDValue Op = N->getOperand(0);

// Optimize DUP(extload/zextload i8/i16) to avoid GPR->FPR transfer.
// For example:
// v4i32 = DUP (i32 (zextloadi8 addr))
// =>
// v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0
// v4i32 = DUPLANE32 (v4i32), 0
if (auto *LD = dyn_cast<LoadSDNode>(Op)) {
ISD::LoadExtType ExtType = LD->getExtensionType();
EVT MemVT = LD->getMemoryVT();
EVT ElemVT = VT.getVectorElementType();
if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) &&
(MemVT == MVT::i8 || MemVT == MVT::i16) && ElemVT != MemVT &&
LD->hasOneUse()) {
EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
128 / ElemVT.getSizeInBits());
SDValue ScalarToVec =
DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op);
return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec,
DCI.DAG.getConstant(0, DL, MVT::i64));
}
}

// If the instruction is known to produce a scalar in SIMD registers, we can
// duplicate it across the vector lanes using DUPLANE instead of moving it
// to a GPR first. For example, this allows us to handle:
// v4i32 = DUP (i32 (FCMGT (f32, f32)))
SDValue Op = N->getOperand(0);
// FIXME: Ideally, we should be able to handle all instructions that
// produce a scalar value in FPRs.
if (Op.getOpcode() == AArch64ISD::FCMEQ ||
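For context, the codegen effect of this combine is roughly the following (a sketch distilled from the dup.ll and dup-ext-load-combine.ll test updates below; the function name is illustrative):

; Before: the byte is loaded into a GPR, then transferred to the SIMD file.
;   ldrb w8, [x0]
;   dup  v0.4s, w8
; After: the load goes straight into a SIMD register and the splat stays there.
;   ldr  b0, [x0]
;   dup  v0.4s, v0.s[0]
define <4 x i32> @sketch_dup_zextload_i8(ptr %p) {
  %l = load i8, ptr %p, align 1
  %e = zext i8 %l to i32
  %v = insertelement <4 x i32> poison, i32 %e, i32 0
  %d = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> zeroinitializer
  ret <4 x i32> %d
}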
78 changes: 58 additions & 20 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4004,26 +4004,6 @@ defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
(SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;

// load zero-extended i32, bitcast to f64
def : Pat <(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
(SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;

// load zero-extended i16, bitcast to f64
def : Pat <(f64 (bitconvert (i64 (zextloadi16 (am_indexed32 GPR64sp:$Rn, uimm12s2:$offset))))),
(SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;

// load zero-extended i8, bitcast to f64
def : Pat <(f64 (bitconvert (i64 (zextloadi8 (am_indexed32 GPR64sp:$Rn, uimm12s1:$offset))))),
(SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;

// load zero-extended i16, bitcast to f32
def : Pat <(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
(SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;

// load zero-extended i8, bitcast to f32
def : Pat <(f32 (bitconvert (i32 (zextloadi8 (am_indexed16 GPR64sp:$Rn, uimm12s1:$offset))))),
(SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;

// Pre-fetch.
def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
[(AArch64Prefetch timm:$Rt,
@@ -4375,6 +4355,64 @@ def : Pat <(v1i64 (scalar_to_vector (i64
(load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
(LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;

// Patterns for bitconvert or scalar_to_vector of load operations.
// Enables direct SIMD register loads for small integer types (i8/i16) that are
// naturally zero-extended to i32/i64.
multiclass ExtLoad8_16AllModes<ValueType OutTy, ValueType InnerTy,
SDPatternOperator OuterOp,
PatFrags LoadOp8, PatFrags LoadOp16> {
// 8-bit loads.
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
(SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
Collaborator:
The problem with load patterns is that there are quite a few addressing modes / combinations that we should be supporting but too often do not add patterns for. The combination of all the types gets a bit out of hand. Some patterns should be considered "canonical" though, that we build others on top of.

Is there another basic form of loads we can base these on? If you try and use the extload+bitcast we added lately then those are incomplete (and look wrong to me, I'll make a patch). We could also consider scalar_to_vec(extload) as a base form; if so, can you think of a nice templated way to make sure we add all the different addressing forms needed?

Contributor Author:
Created a templated form that accepts both scalar_to_vector and bitconvert operations.

def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
(SUBREG_TO_REG (i64 0), (LDURBi GPR64sp:$Rn, simm9:$offset), bsub)>;
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend))))),
(SUBREG_TO_REG (i64 0), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend), bsub)>;
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend))))),
(SUBREG_TO_REG (i64 0), (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend), bsub)>;

// 16-bit loads.
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
(SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
(SUBREG_TO_REG (i64 0), (LDURHi GPR64sp:$Rn, simm9:$offset), hsub)>;
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend))))),
(SUBREG_TO_REG (i64 0), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend), hsub)>;
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend))))),
(SUBREG_TO_REG (i64 0), (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend), hsub)>;
}

// Extended multiclass that includes 32-bit loads in addition to 8-bit and 16-bit.
multiclass ExtLoad8_16_32AllModes<ValueType OutTy, ValueType InnerTy,
SDPatternOperator OuterOp,
PatFrags LoadOp8, PatFrags LoadOp16, PatFrags LoadOp32> {
defm : ExtLoad8_16AllModes<OutTy, InnerTy, OuterOp, LoadOp8, LoadOp16>;

// 32-bit loads.
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
(SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
(SUBREG_TO_REG (i64 0), (LDURSi GPR64sp:$Rn, simm9:$offset), ssub)>;
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend))))),
(SUBREG_TO_REG (i64 0), (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend), ssub)>;
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend))))),
(SUBREG_TO_REG (i64 0), (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend), ssub)>;
}

// Instantiate bitconvert patterns for floating-point types.
defm : ExtLoad8_16AllModes<f32, i32, bitconvert, zextloadi8, zextloadi16>;
defm : ExtLoad8_16_32AllModes<f64, i64, bitconvert, zextloadi8, zextloadi16, zextloadi32>;

// Instantiate scalar_to_vector patterns for all vector types.
defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, zextloadi8, zextloadi16>;
defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, extloadi8, extloadi16>;
defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, zextloadi8, zextloadi16>;
defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, extloadi8, extloadi16>;
defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, zextloadi8, zextloadi16>;
defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, extloadi8, extloadi16>;
defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, zextloadi8, zextloadi16, zextloadi32>;
defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, extloadi8, extloadi16, extloadi32>;

// Pre-fetch.
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
[(AArch64Prefetch timm:$Rt,
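As a concrete example of what the bitconvert instantiations match, IR along these lines (a minimal sketch; the function name is illustrative) selects ldr h0, [x0] directly instead of an ldrh into a GPR followed by an fmov; the multiclass now also provides the unscaled (ldur) and register-offset addressing forms:

define float @sketch_bitcast_zextload_i16(ptr %p) {
  %l = load i16, ptr %p, align 2
  %e = zext i16 %l to i32
  %f = bitcast i32 %e to float
  ret float %f
}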
14 changes: 6 additions & 8 deletions llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -222,22 +222,20 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
; CHECK-NEON-NEXT: ldrh w8, [x0]
; CHECK-NEON-NEXT: ldrh w9, [x0, #2]
; CHECK-NEON-NEXT: ldrh w8, [x0, #2]
; CHECK-NEON-NEXT: ldr h0, [x0]
; CHECK-NEON-NEXT: ldr d1, [x1]
; CHECK-NEON-NEXT: fmov d0, x8
; CHECK-NEON-NEXT: mov v0.d[1], x9
; CHECK-NEON-NEXT: mov v0.d[1], x8
; CHECK-NEON-NEXT: xtn v0.2s, v0.2d
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
; CHECK-SVE-NEXT: ldrh w8, [x0]
; CHECK-SVE-NEXT: ldrh w9, [x0, #2]
; CHECK-SVE-NEXT: ldrh w8, [x0, #2]
; CHECK-SVE-NEXT: ldr h0, [x0]
; CHECK-SVE-NEXT: ldr d1, [x1]
; CHECK-SVE-NEXT: fmov d0, x8
; CHECK-SVE-NEXT: mov v0.d[1], x9
; CHECK-SVE-NEXT: mov v0.d[1], x8
; CHECK-SVE-NEXT: xtn v0.2s, v0.2d
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
139 changes: 139 additions & 0 deletions llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
@@ -0,0 +1,139 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s

; Test the DUP combine for zero/any-extended narrow (i8/i16) loads.
; This should avoid GPR->SIMD transfers by loading directly into vector registers.

define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0]
; CHECK-NEXT: dup v0.4s, v0.s[0]
; CHECK-NEXT: ret
%load = load i8, ptr %p, align 1
%ext = zext i8 %load to i32
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
ret <4 x i32> %dup
}

define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i16_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: dup v0.4s, v0.s[0]
; CHECK-NEXT: ret
%load = load i16, ptr %p, align 2
%ext = zext i16 %load to i32
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
ret <4 x i32> %dup
}

define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0]
; CHECK-NEXT: dup v0.2s, v0.s[0]
; CHECK-NEXT: ret
%load = load i8, ptr %p, align 1
%ext = zext i8 %load to i32
%vec = insertelement <2 x i32> poison, i32 %ext, i32 0
%dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
ret <2 x i32> %dup
}

define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i16_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0]
; CHECK-NEXT: dup v0.2s, v0.s[0]
; CHECK-NEXT: ret
%load = load i16, ptr %p, align 2
%ext = zext i16 %load to i32
%vec = insertelement <2 x i32> poison, i32 %ext, i32 0
%dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
ret <2 x i32> %dup
}

define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0]
; CHECK-NEXT: dup v0.8h, v0.h[0]
; CHECK-NEXT: ret
%load = load i8, ptr %p, align 1
%ext = zext i8 %load to i16
%vec = insertelement <8 x i16> poison, i16 %ext, i32 0
%dup = shufflevector <8 x i16> %vec, <8 x i16> poison, <8 x i32> zeroinitializer
ret <8 x i16> %dup
}

define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0]
; CHECK-NEXT: dup v0.4h, v0.h[0]
; CHECK-NEXT: ret
%load = load i8, ptr %p, align 1
%ext = zext i8 %load to i16
%vec = insertelement <4 x i16> poison, i16 %ext, i32 0
%dup = shufflevector <4 x i16> %vec, <4 x i16> poison, <4 x i32> zeroinitializer
ret <4 x i16> %dup
}

define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0, #4]
; CHECK-NEXT: dup v0.4s, v0.s[0]
; CHECK-NEXT: ret
%addr = getelementptr inbounds i8, ptr %p, i64 4
%load = load i8, ptr %addr, align 1
%ext = zext i8 %load to i32
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
ret <4 x i32> %dup
}

define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, #8]
; CHECK-NEXT: dup v0.4s, v0.s[0]
; CHECK-NEXT: ret
%addr = getelementptr inbounds i16, ptr %p, i64 4
%load = load i16, ptr %addr, align 2
%ext = zext i16 %load to i32
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
ret <4 x i32> %dup
}

define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr b0, [x0, x1]
; CHECK-NEXT: dup v0.4s, v0.s[0]
; CHECK-NEXT: ret
%addr = getelementptr inbounds i8, ptr %p, i64 %offset
%load = load i8, ptr %addr, align 1
%ext = zext i8 %load to i32
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
ret <4 x i32> %dup
}

define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
; CHECK-NEXT: dup v0.4s, v0.s[0]
; CHECK-NEXT: ret
%addr = getelementptr inbounds i16, ptr %p, i64 %offset
%load = load i16, ptr %addr, align 2
%ext = zext i16 %load to i32
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
ret <4 x i32> %dup
}
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AArch64/dup.ll
@@ -32,8 +32,8 @@ entry:
define <2 x i8> @loaddup_v2i8(ptr %p) {
; CHECK-LABEL: loaddup_v2i8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ldrb w8, [x0]
; CHECK-NEXT: dup v0.2s, w8
; CHECK-NEXT: ldr b0, [x0]
; CHECK-NEXT: dup v0.2s, v0.s[0]
; CHECK-NEXT: ret
entry:
%a = load i8, ptr %p
@@ -189,8 +189,8 @@ entry:
define <4 x i8> @loaddup_v4i8(ptr %p) {
; CHECK-SD-LABEL: loaddup_v4i8:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldrb w8, [x0]
; CHECK-SD-NEXT: dup v0.4h, w8
; CHECK-SD-NEXT: ldr b0, [x0]
; CHECK-SD-NEXT: dup v0.4h, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v4i8:
@@ -444,8 +444,8 @@ entry:
define <2 x i16> @loaddup_v2i16(ptr %p) {
; CHECK-SD-LABEL: loaddup_v2i16:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: ldrh w8, [x0]
; CHECK-SD-NEXT: dup v0.2s, w8
; CHECK-SD-NEXT: ldr h0, [x0]
; CHECK-SD-NEXT: dup v0.2s, v0.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v2i16: