diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 662d84b7a60a8..417fdbe1a1cdc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -26665,11 +26665,34 @@ static SDValue performDUPCombine(SDNode *N,
   }
 
   if (N->getOpcode() == AArch64ISD::DUP) {
+    SDValue Op = N->getOperand(0);
+
+    // Optimize DUP(extload/zextload i8/i16) to avoid GPR->FPR transfer.
+    // For example:
+    //   v4i32 = DUP (i32 (zextloadi8 addr))
+    // =>
+    //   v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0
+    //   v4i32 = DUPLANE32 (v4i32), 0
+    if (auto *LD = dyn_cast<LoadSDNode>(Op)) {
+      ISD::LoadExtType ExtType = LD->getExtensionType();
+      EVT MemVT = LD->getMemoryVT();
+      EVT ElemVT = VT.getVectorElementType();
+      if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) &&
+          (MemVT == MVT::i8 || MemVT == MVT::i16) && ElemVT != MemVT &&
+          LD->hasOneUse()) {
+        EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
+                                        128 / ElemVT.getSizeInBits());
+        SDValue ScalarToVec =
+            DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op);
+        return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec,
+                               DCI.DAG.getConstant(0, DL, MVT::i64));
+      }
+    }
+
     // If the instruction is known to produce a scalar in SIMD registers, we can
     // duplicate it across the vector lanes using DUPLANE instead of moving it
     // to a GPR first. For example, this allows us to handle:
     //   v4i32 = DUP (i32 (FCMGT (f32, f32)))
-    SDValue Op = N->getOperand(0);
     // FIXME: Ideally, we should be able to handle all instructions that
     // produce a scalar value in FPRs.
     if (Op.getOpcode() == AArch64ISD::FCMEQ ||
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f788c7510f80c..a2a1b43d3a372 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4004,26 +4004,6 @@ defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
 def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
           (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
 
-// load zero-extended i32, bitcast to f64
-def : Pat <(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
-           (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
-
-// load zero-extended i16, bitcast to f64
-def : Pat <(f64 (bitconvert (i64 (zextloadi16 (am_indexed32 GPR64sp:$Rn, uimm12s2:$offset))))),
-           (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-
-// load zero-extended i8, bitcast to f64
-def : Pat <(f64 (bitconvert (i64 (zextloadi8 (am_indexed32 GPR64sp:$Rn, uimm12s1:$offset))))),
-           (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-
-// load zero-extended i16, bitcast to f32
-def : Pat <(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
-           (SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
-
-// load zero-extended i8, bitcast to f32
-def : Pat <(f32 (bitconvert (i32 (zextloadi8 (am_indexed16 GPR64sp:$Rn, uimm12s1:$offset))))),
-           (SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
-
 // Pre-fetch.
 def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
                         [(AArch64Prefetch timm:$Rt,
@@ -4375,6 +4355,64 @@ def : Pat <(v1i64 (scalar_to_vector (i64
              (load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
            (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;
 
+// Patterns for bitconvert or scalar_to_vector of load operations.
+// Enables direct SIMD register loads for small integer types (i8/i16) that are
+// naturally zero-extended to i32/i64.
+multiclass ExtLoad8_16AllModes<SDPatternOperator OuterOp, ValueType OutTy,
+                               ValueType InnerTy, SDPatternOperator LoadOp8,
+                               SDPatternOperator LoadOp16> {
+  // 8-bit loads.
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+            (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+            (SUBREG_TO_REG (i64 0), (LDURBi GPR64sp:$Rn, simm9:$offset), bsub)>;
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend))))),
+            (SUBREG_TO_REG (i64 0), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend), bsub)>;
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend))))),
+            (SUBREG_TO_REG (i64 0), (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend), bsub)>;
+
+  // 16-bit loads.
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+            (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+            (SUBREG_TO_REG (i64 0), (LDURHi GPR64sp:$Rn, simm9:$offset), hsub)>;
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend))))),
+            (SUBREG_TO_REG (i64 0), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend), hsub)>;
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend))))),
+            (SUBREG_TO_REG (i64 0), (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend), hsub)>;
+}
+
+// Extended multiclass that includes 32-bit loads in addition to 8-bit and 16-bit.
+multiclass ExtLoad8_16_32AllModes<SDPatternOperator OuterOp, ValueType OutTy,
+                                  ValueType InnerTy, SDPatternOperator LoadOp8,
+                                  SDPatternOperator LoadOp16, SDPatternOperator LoadOp32> {
+  defm : ExtLoad8_16AllModes<OuterOp, OutTy, InnerTy, LoadOp8, LoadOp16>;
+
+  // 32-bit loads.
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+            (SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
+            (SUBREG_TO_REG (i64 0), (LDURSi GPR64sp:$Rn, simm9:$offset), ssub)>;
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend))))),
+            (SUBREG_TO_REG (i64 0), (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend), ssub)>;
+  def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend))))),
+            (SUBREG_TO_REG (i64 0), (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend), ssub)>;
+}
+
+// Instantiate bitconvert patterns for floating-point types.
+defm : ExtLoad8_16AllModes;
+defm : ExtLoad8_16_32AllModes;
+
+// Instantiate scalar_to_vector patterns for all vector types.
+defm : ExtLoad8_16AllModes;
+defm : ExtLoad8_16AllModes;
+defm : ExtLoad8_16AllModes;
+defm : ExtLoad8_16AllModes;
+defm : ExtLoad8_16AllModes;
+defm : ExtLoad8_16AllModes;
+defm : ExtLoad8_16_32AllModes;
+defm : ExtLoad8_16_32AllModes;
+
 // Pre-fetch.
 defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
                               [(AArch64Prefetch timm:$Rt,
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 6e5c666bdbc75..0cd885e599817 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -222,22 +222,20 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
 define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
 ; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
 ; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    ldrh w8, [x0]
-; CHECK-NEON-NEXT:    ldrh w9, [x0, #2]
+; CHECK-NEON-NEXT:    ldrh w8, [x0, #2]
+; CHECK-NEON-NEXT:    ldr h0, [x0]
 ; CHECK-NEON-NEXT:    ldr d1, [x1]
-; CHECK-NEON-NEXT:    fmov d0, x8
-; CHECK-NEON-NEXT:    mov v0.d[1], x9
+; CHECK-NEON-NEXT:    mov v0.d[1], x8
 ; CHECK-NEON-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-NEON-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-NEON-NEXT:    ret
 ;
 ; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
 ; CHECK-SVE:       // %bb.0:
-; CHECK-SVE-NEXT:    ldrh w8, [x0]
-; CHECK-SVE-NEXT:    ldrh w9, [x0, #2]
+; CHECK-SVE-NEXT:    ldrh w8, [x0, #2]
+; CHECK-SVE-NEXT:    ldr h0, [x0]
 ; CHECK-SVE-NEXT:    ldr d1, [x1]
-; CHECK-SVE-NEXT:    fmov d0, x8
-; CHECK-SVE-NEXT:    mov v0.d[1], x9
+; CHECK-SVE-NEXT:    mov v0.d[1], x8
 ; CHECK-SVE-NEXT:    xtn v0.2s, v0.2d
 ; CHECK-SVE-NEXT:    smull v0.2d, v0.2s, v1.2s
 ; CHECK-SVE-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
new file mode 100644
index 0000000000000..5a54015fcde67
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; Test optimization of DUP with extended narrow loads
+; This should avoid GPR->SIMD transfers by loading directly into vector registers
+
+define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+  %load = load i8, ptr %p, align 1
+  %ext = zext i8 %load to i32
+  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+  %load = load i16, ptr %p, align 2
+  %ext = zext i16 %load to i32
+  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
+; CHECK-NEXT:    ret
+  %load = load i8, ptr %p, align 1
+  %ext = zext i8 %load to i32
+  %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
+  %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
+  ret <2 x i32> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0]
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
+; CHECK-NEXT:    ret
+  %load = load i16, ptr %p, align 2
+  %ext = zext i16 %load to i32
+  %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
+  %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
+  ret <2 x i32> %dup
+}
+
+define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    dup v0.8h, v0.h[0]
+; CHECK-NEXT:    ret
+  %load = load i8, ptr %p, align 1
+  %ext = zext i8 %load to i16
+  %vec = insertelement <8 x i16> poison, i16 %ext, i32 0
+  %dup = shufflevector <8 x i16> %vec, <8 x i16> poison, <8 x i32> zeroinitializer
+  ret <8 x i16> %dup
+}
+
+define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    dup v0.4h, v0.h[0]
+; CHECK-NEXT:    ret
+  %load = load i8, ptr %p, align 1
+  %ext = zext i8 %load to i16
+  %vec = insertelement <4 x i16> poison, i16 %ext, i32 0
+  %dup = shufflevector <4 x i16> %vec, <4 x i16> poison, <4 x i32> zeroinitializer
+  ret <4 x i16> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0, #4]
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+  %addr = getelementptr inbounds i8, ptr %p, i64 4
+  %load = load i8, ptr %addr, align 1
+  %ext = zext i8 %load to i32
+  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0, #8]
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+  %addr = getelementptr inbounds i16, ptr %p, i64 4
+  %load = load i16, ptr %addr, align 2
+  %ext = zext i16 %load to i32
+  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr b0, [x0, x1]
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+  %addr = getelementptr inbounds i8, ptr %p, i64 %offset
+  %load = load i8, ptr %addr, align 1
+  %ext = zext i8 %load to i32
+  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ldr h0, [x0, x1, lsl #1]
+; CHECK-NEXT:    dup v0.4s, v0.s[0]
+; CHECK-NEXT:    ret
+  %addr = getelementptr inbounds i16, ptr %p, i64 %offset
+  %load = load i16, ptr %addr, align 2
+  %ext = zext i16 %load to i32
+  %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+  %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+  ret <4 x i32> %dup
+}
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 079ff1076b110..670574f24b4a4 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -32,8 +32,8 @@ entry:
 define <2 x i8> @loaddup_v2i8(ptr %p) {
 ; CHECK-LABEL: loaddup_v2i8:
 ; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    ldrb w8, [x0]
-; CHECK-NEXT:    dup v0.2s, w8
+; CHECK-NEXT:    ldr b0, [x0]
+; CHECK-NEXT:    dup v0.2s, v0.s[0]
 ; CHECK-NEXT:    ret
 entry:
   %a = load i8, ptr %p
@@ -189,8 +189,8 @@ entry:
 define <4 x i8> @loaddup_v4i8(ptr %p) {
 ; CHECK-SD-LABEL: loaddup_v4i8:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldrb w8, [x0]
-; CHECK-SD-NEXT:    dup v0.4h, w8
+; CHECK-SD-NEXT:    ldr b0, [x0]
+; CHECK-SD-NEXT:    dup v0.4h, v0.h[0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: loaddup_v4i8:
@@ -444,8 +444,8 @@ entry:
 define <2 x i16> @loaddup_v2i16(ptr %p) {
 ; CHECK-SD-LABEL: loaddup_v2i16:
 ; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    ldrh w8, [x0]
-; CHECK-SD-NEXT:    dup v0.2s, w8
+; CHECK-SD-NEXT:    ldr h0, [x0]
+; CHECK-SD-NEXT:    dup v0.2s, v0.s[0]
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: loaddup_v2i16: