[AArch64] Optimize splat of extending loads to avoid GPR->FPR transfer #163067
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Guy David (guy-david)

Changes

Loads the data into the SIMD register, thus sparing a physical register and a potentially costly movement of data.

Full diff: https://github.com/llvm/llvm-project/pull/163067.diff

5 Files Affected:
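For a concrete sense of the change before the per-file diff, this is the shape of the transformation, paraphrased from the test updates below (the 4-lane splat of a zero-extended i8 load is used as the example):

Before (round-trips through a GPR):
    ldrb w8, [x0]
    dup  v0.4s, w8

After (loaded directly into a SIMD register):
    ldr  b0, [x0]
    dup  v0.4s, v0.s[0]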
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 69651168f8539..67ade1b4bac25 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21737,6 +21737,7 @@ static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
+
if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
N0.getOpcode() == AArch64ISD::DUP) {
SDValue Op = N0.getOperand(0);
@@ -26632,11 +26633,34 @@ static SDValue performDUPCombine(SDNode *N,
}
if (N->getOpcode() == AArch64ISD::DUP) {
+ SDValue Op = N->getOperand(0);
+
+ // Optimize DUP(extload/zextload i8/i16) to avoid GPR->FPR transfer.
+ // For example:
+ // v4i32 = DUP (i32 (zextloadi8 addr))
+ // =>
+ // v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0
+ // v4i32 = DUPLANE32 (v4i32), 0
+ if (auto *LD = dyn_cast<LoadSDNode>(Op)) {
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ EVT MemVT = LD->getMemoryVT();
+ EVT ElemVT = VT.getVectorElementType();
+ if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) &&
+ (MemVT == MVT::i8 || MemVT == MVT::i16) && ElemVT != MemVT &&
+ LD->hasOneUse()) {
+ EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
+ 128 / ElemVT.getSizeInBits());
+ SDValue ScalarToVec =
+ DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op);
+ return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec,
+ DCI.DAG.getConstant(0, DL, MVT::i64));
+ }
+ }
+
// If the instruction is known to produce a scalar in SIMD registers, we can
// duplicate it across the vector lanes using DUPLANE instead of moving it
// to a GPR first. For example, this allows us to handle:
// v4i32 = DUP (i32 (FCMGT (f32, f32)))
- SDValue Op = N->getOperand(0);
// FIXME: Ideally, we should be able to handle all instructions that
// produce a scalar value in FPRs.
if (Op.getOpcode() == AArch64ISD::FCMEQ ||
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f788c7510f80c..02de35ee053f9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4375,6 +4375,26 @@ def : Pat <(v1i64 (scalar_to_vector (i64
(load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
(LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;
+// Patterns for scalar_to_vector with zero-extended loads.
+// Enables direct SIMD register loads for small integer types (i8/i16) that are
+// naturally zero-extended to i32/i64.
+multiclass ScalarToVectorExtLoad<ValueType VecTy, ValueType ScalarTy> {
+ def : Pat<(VecTy (scalar_to_vector (ScalarTy (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+ def : Pat<(VecTy (scalar_to_vector (ScalarTy (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+
+ def : Pat<(VecTy (scalar_to_vector (ScalarTy (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+ def : Pat<(VecTy (scalar_to_vector (ScalarTy (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+}
+
+defm : ScalarToVectorExtLoad<v16i8, i32>;
+defm : ScalarToVectorExtLoad<v8i16, i32>;
+defm : ScalarToVectorExtLoad<v4i32, i32>;
+defm : ScalarToVectorExtLoad<v2i64, i64>;
+
// Pre-fetch.
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
[(AArch64Prefetch timm:$Rt,
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 6e5c666bdbc75..0cd885e599817 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -222,22 +222,20 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: ldrh w8, [x0]
-; CHECK-NEON-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEON-NEXT: ldrh w8, [x0, #2]
+; CHECK-NEON-NEXT: ldr h0, [x0]
; CHECK-NEON-NEXT: ldr d1, [x1]
-; CHECK-NEON-NEXT: fmov d0, x8
-; CHECK-NEON-NEXT: mov v0.d[1], x9
+; CHECK-NEON-NEXT: mov v0.d[1], x8
; CHECK-NEON-NEXT: xtn v0.2s, v0.2d
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: ldrh w8, [x0]
-; CHECK-SVE-NEXT: ldrh w9, [x0, #2]
+; CHECK-SVE-NEXT: ldrh w8, [x0, #2]
+; CHECK-SVE-NEXT: ldr h0, [x0]
; CHECK-SVE-NEXT: ldr d1, [x1]
-; CHECK-SVE-NEXT: fmov d0, x8
-; CHECK-SVE-NEXT: mov v0.d[1], x9
+; CHECK-SVE-NEXT: mov v0.d[1], x8
; CHECK-SVE-NEXT: xtn v0.2s, v0.2d
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
new file mode 100644
index 0000000000000..5d9908b0768ae
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; Test optimization of DUP with extended narrow loads
+; This should avoid GPR->SIMD transfers by loading directly into vector registers
+
+define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 2
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <2 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 2
+ %ext = zext i16 %load to i32
+ %vec = insertelement <2 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <8 x i16> undef, i16 %ext, i32 0
+ %dup = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %dup
+}
+
+define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <4 x i16> undef, i16 %ext, i32 0
+ %dup = shufflevector <4 x i16> %vec, <4 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %dup
+}
+
+; Test with offset addressing
+define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, #4]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 4
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, #8]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 4
+ %load = load i16, ptr %addr, align 2
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+; Test with register offset addressing
+define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, x1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 %offset
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 %offset
+ %load = load i16, ptr %addr, align 2
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+; Negative test: sign-extended loads should not use this optimization
+define <4 x i32> @test_dup_sextload_i8_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_sextload_i8_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrsb w8, [x0]
+; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = sext i8 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+; Negative test: i32 loads don't need this optimization
+define <4 x i32> @test_dup_load_i32_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_load_i32_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1r { v0.4s }, [x0]
+; CHECK-NEXT: ret
+ %load = load i32, ptr %p, align 4
+ %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+; Test that truncate(dup(zextload)) doesn't generate unnecessary XTN
+define <8 x i8> @test_truncate_dup_zextload_i8_v8i8(ptr %p) {
+; CHECK-LABEL: test_truncate_dup_zextload_i8_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1r { v0.8b }, [x0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <8 x i16> undef, i16 %ext, i32 0
+ %dup = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> zeroinitializer
+ %trunc = trunc <8 x i16> %dup to <8 x i8>
+ ret <8 x i8> %trunc
+}
+
+; Test with an i8 load zero-extended to i32 and truncated back to i8
+define <8 x i8> @test_truncate_dup_zextload_i8_from_i32_v8i8(ptr %p) {
+; CHECK-LABEL: test_truncate_dup_zextload_i8_from_i32_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1r { v0.8b }, [x0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ %trunc = trunc <4 x i32> %dup to <4 x i8>
+ ; Widen to v8i8 to match the test output
+ %result = shufflevector <4 x i8> %trunc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i8> %result
+}
+
+; Test with i16 load truncated to i8
+define <8 x i8> @test_truncate_dup_zextload_i16_to_i8_v8i8(ptr %p) {
+; CHECK-LABEL: test_truncate_dup_zextload_i16_to_i8_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1r { v0.8b }, [x0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 2
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ %trunc = trunc <4 x i32> %dup to <4 x i8>
+ ; Widen to v8i8 to match the test output
+ %result = shufflevector <4 x i8> %trunc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i8> %result
+}
+
+; Test generalized truncate(dup(scalar_to_vector)) for non-load case
+define <8 x i8> @test_truncate_dup_scalar_i32_to_i8_v8i8(i32 %val) {
+; CHECK-LABEL: test_truncate_dup_scalar_i32_to_i8_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8b, w0
+; CHECK-NEXT: ret
+ %vec = insertelement <4 x i32> undef, i32 %val, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ %trunc = trunc <4 x i32> %dup to <4 x i8>
+ ; Widen to v8i8 to match the test output
+ %result = shufflevector <4 x i8> %trunc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i8> %result
+}
+
+; Test generalized truncate(dup(scalar_to_vector)) i16 to i8
+define <8 x i8> @test_truncate_dup_scalar_i16_to_i8_v8i8(i16 %val) {
+; CHECK-LABEL: test_truncate_dup_scalar_i16_to_i8_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8b, w0
+; CHECK-NEXT: ret
+ %vec = insertelement <8 x i16> undef, i16 %val, i32 0
+ %dup = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> zeroinitializer
+ %trunc = trunc <8 x i16> %dup to <8 x i8>
+ ret <8 x i8> %trunc
+}
+
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 079ff1076b110..670574f24b4a4 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -32,8 +32,8 @@ entry:
define <2 x i8> @loaddup_v2i8(ptr %p) {
; CHECK-LABEL: loaddup_v2i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: dup v0.2s, w8
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
; CHECK-NEXT: ret
entry:
%a = load i8, ptr %p
@@ -189,8 +189,8 @@ entry:
define <4 x i8> @loaddup_v4i8(ptr %p) {
; CHECK-SD-LABEL: loaddup_v4i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: dup v0.4h, w8
+; CHECK-SD-NEXT: ldr b0, [x0]
+; CHECK-SD-NEXT: dup v0.4h, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v4i8:
@@ -444,8 +444,8 @@ entry:
define <2 x i16> @loaddup_v2i16(ptr %p) {
; CHECK-SD-LABEL: loaddup_v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: dup v0.2s, w8
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: dup v0.2s, v0.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v2i16:
✅ With the latest revision this PR passed the undef deprecator.
Force-pushed from c0d78a3 to 8bb10cf
Force-pushed from 8bb10cf to 9aa805d
Review comment on the new ScalarToVectorExtLoad patterns in llvm/lib/Target/AArch64/AArch64InstrInfo.td:

// naturally zero-extended to i32/i64.
multiclass ScalarToVectorExtLoad<ValueType VecTy, ValueType ScalarTy> {
  def : Pat<(VecTy (scalar_to_vector (ScalarTy (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
            (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
The problem with load patterns is that there are quite a few addressing modes and combinations that we should be supporting but too often do not add patterns for, and the combination of all the types gets a bit out of hand. Some patterns should be considered "canonical", though, that we build the others on top of.
Is there another basic form of loads we can base these on? If you try to use the extload+bitcast patterns we added lately, those are incomplete (and look wrong to me; I'll make a patch). We could also consider scalar_to_vec(extload) as a base form; if so, can you think of a nice templated way to make sure we add all the different addressing forms needed?
Created a templated form which accepts both scalar_to_vector and bitconvert operations.
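A rough sketch of what such a consolidated multiclass could look like; this is illustrative only (the multiclass and parameter names are made up here, only the unsigned-immediate addressing form and the zero-extending loads are shown, and the actual template in the updated revision may differ):

// Illustrative only: a single multiclass parameterized over the wrapping
// operator, so the scalar_to_vector and bitconvert instantiations share the
// same underlying load patterns.
multiclass FPRLoadExtPat<SDPatternOperator WrapOp, ValueType ResTy, ValueType ScalarTy> {
  def : Pat<(ResTy (WrapOp (ScalarTy (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
            (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
  def : Pat<(ResTy (WrapOp (ScalarTy (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
            (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
}

// Hypothetical instantiations covering both wrapping forms:
defm : FPRLoadExtPat<scalar_to_vector, v4i32, i32>;
defm : FPRLoadExtPat<bitconvert, f32, i32>;

The remaining addressing modes (register-offset, unscaled, etc.) and the anyext loads would be instantiated the same way.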
Loads the data into the SIMD register, thus sparing a physical register and a potentially costly movement of data. Consolidated into a template which also handles a similar bitconvert pattern.
Force-pushed from 9aa805d to 4b18219