[AArch64] Optimize splat of extending loads to avoid GPR->FPR transfer #163067
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Guy David (guy-david)

Changes

Loads the data into the SIMD register, thus sparing a physical register and a potentially costly movement of data.

Full diff: https://github.com/llvm/llvm-project/pull/163067.diff

5 Files Affected:
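For a concrete sense of the change before the per-file diff, this is the shape of the transformation, paraphrased from the test updates below (the 4-lane splat of a zero-extended i8 load is used as the example):

Before (round-trips through a GPR):
    ldrb w8, [x0]
    dup  v0.4s, w8

After (loaded directly into a SIMD register):
    ldr  b0, [x0]
    dup  v0.4s, v0.s[0]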
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 69651168f8539..67ade1b4bac25 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21737,6 +21737,7 @@ static SDValue performTruncateCombine(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
+
if (VT.isFixedLengthVector() && VT.is64BitVector() && N0.hasOneUse() &&
N0.getOpcode() == AArch64ISD::DUP) {
SDValue Op = N0.getOperand(0);
@@ -26632,11 +26633,34 @@ static SDValue performDUPCombine(SDNode *N,
}
if (N->getOpcode() == AArch64ISD::DUP) {
+ SDValue Op = N->getOperand(0);
+
+ // Optimize DUP(extload/zextload i8/i16) to avoid GPR->FPR transfer.
+ // For example:
+ // v4i32 = DUP (i32 (zextloadi8 addr))
+ // =>
+ // v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0
+ // v4i32 = DUPLANE32 (v4i32), 0
+ if (auto *LD = dyn_cast<LoadSDNode>(Op)) {
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ EVT MemVT = LD->getMemoryVT();
+ EVT ElemVT = VT.getVectorElementType();
+ if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) &&
+ (MemVT == MVT::i8 || MemVT == MVT::i16) && ElemVT != MemVT &&
+ LD->hasOneUse()) {
+ EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
+ 128 / ElemVT.getSizeInBits());
+ SDValue ScalarToVec =
+ DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op);
+ return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec,
+ DCI.DAG.getConstant(0, DL, MVT::i64));
+ }
+ }
+
// If the instruction is known to produce a scalar in SIMD registers, we can
// duplicate it across the vector lanes using DUPLANE instead of moving it
// to a GPR first. For example, this allows us to handle:
// v4i32 = DUP (i32 (FCMGT (f32, f32)))
- SDValue Op = N->getOperand(0);
// FIXME: Ideally, we should be able to handle all instructions that
// produce a scalar value in FPRs.
if (Op.getOpcode() == AArch64ISD::FCMEQ ||
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index f788c7510f80c..02de35ee053f9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -4375,6 +4375,26 @@ def : Pat <(v1i64 (scalar_to_vector (i64
(load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
(LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;
+// Patterns for scalar_to_vector with zero-extended loads.
+// Enables direct SIMD register loads for small integer types (i8/i16) that are
+// naturally zero-extended to i32/i64.
+multiclass ScalarToVectorExtLoad<ValueType VecTy, ValueType ScalarTy> {
+ def : Pat<(VecTy (scalar_to_vector (ScalarTy (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+ def : Pat<(VecTy (scalar_to_vector (ScalarTy (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+
+ def : Pat<(VecTy (scalar_to_vector (ScalarTy (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+ def : Pat<(VecTy (scalar_to_vector (ScalarTy (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+ (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+}
+
+defm : ScalarToVectorExtLoad<v16i8, i32>;
+defm : ScalarToVectorExtLoad<v8i16, i32>;
+defm : ScalarToVectorExtLoad<v4i32, i32>;
+defm : ScalarToVectorExtLoad<v2i64, i64>;
+
// Pre-fetch.
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
[(AArch64Prefetch timm:$Rt,
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 6e5c666bdbc75..0cd885e599817 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -222,22 +222,20 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: ldrh w8, [x0]
-; CHECK-NEON-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEON-NEXT: ldrh w8, [x0, #2]
+; CHECK-NEON-NEXT: ldr h0, [x0]
; CHECK-NEON-NEXT: ldr d1, [x1]
-; CHECK-NEON-NEXT: fmov d0, x8
-; CHECK-NEON-NEXT: mov v0.d[1], x9
+; CHECK-NEON-NEXT: mov v0.d[1], x8
; CHECK-NEON-NEXT: xtn v0.2s, v0.2d
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: ldrh w8, [x0]
-; CHECK-SVE-NEXT: ldrh w9, [x0, #2]
+; CHECK-SVE-NEXT: ldrh w8, [x0, #2]
+; CHECK-SVE-NEXT: ldr h0, [x0]
; CHECK-SVE-NEXT: ldr d1, [x1]
-; CHECK-SVE-NEXT: fmov d0, x8
-; CHECK-SVE-NEXT: mov v0.d[1], x9
+; CHECK-SVE-NEXT: mov v0.d[1], x8
; CHECK-SVE-NEXT: xtn v0.2s, v0.2d
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
new file mode 100644
index 0000000000000..5d9908b0768ae
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; Test optimization of DUP with extended narrow loads
+; This should avoid GPR->SIMD transfers by loading directly into vector registers
+
+define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 2
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <2 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 2
+ %ext = zext i16 %load to i32
+ %vec = insertelement <2 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> undef, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <8 x i16> undef, i16 %ext, i32 0
+ %dup = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> zeroinitializer
+ ret <8 x i16> %dup
+}
+
+define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <4 x i16> undef, i16 %ext, i32 0
+ %dup = shufflevector <4 x i16> %vec, <4 x i16> undef, <4 x i32> zeroinitializer
+ ret <4 x i16> %dup
+}
+
+; Test with offset addressing
+define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, #4]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 4
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, #8]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 4
+ %load = load i16, ptr %addr, align 2
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+; Test with register offset addressing
+define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, x1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 %offset
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 %offset
+ %load = load i16, ptr %addr, align 2
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+; Negative test: sign-extended loads should not use this optimization
+define <4 x i32> @test_dup_sextload_i8_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_sextload_i8_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldrsb w8, [x0]
+; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = sext i8 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+; Negative test: i32 loads don't need this optimization
+define <4 x i32> @test_dup_load_i32_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_load_i32_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1r { v0.4s }, [x0]
+; CHECK-NEXT: ret
+ %load = load i32, ptr %p, align 4
+ %vec = insertelement <4 x i32> undef, i32 %load, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+; Test that truncate(dup(zextload)) doesn't generate unnecessary XTN
+define <8 x i8> @test_truncate_dup_zextload_i8_v8i8(ptr %p) {
+; CHECK-LABEL: test_truncate_dup_zextload_i8_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1r { v0.8b }, [x0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <8 x i16> undef, i16 %ext, i32 0
+ %dup = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> zeroinitializer
+ %trunc = trunc <8 x i16> %dup to <8 x i8>
+ ret <8 x i8> %trunc
+}
+
+; Test with an i8 load zero-extended to i32 and truncated back to i8
+define <8 x i8> @test_truncate_dup_zextload_i8_from_i32_v8i8(ptr %p) {
+; CHECK-LABEL: test_truncate_dup_zextload_i8_from_i32_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1r { v0.8b }, [x0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ %trunc = trunc <4 x i32> %dup to <4 x i8>
+ ; Widen to v8i8 to match the test output
+ %result = shufflevector <4 x i8> %trunc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i8> %result
+}
+
+; Test with i16 load truncated to i8
+define <8 x i8> @test_truncate_dup_zextload_i16_to_i8_v8i8(ptr %p) {
+; CHECK-LABEL: test_truncate_dup_zextload_i16_to_i8_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ld1r { v0.8b }, [x0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 2
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> undef, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ %trunc = trunc <4 x i32> %dup to <4 x i8>
+ ; Widen to v8i8 to match the test output
+ %result = shufflevector <4 x i8> %trunc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i8> %result
+}
+
+; Test generalized truncate(dup(scalar_to_vector)) for non-load case
+define <8 x i8> @test_truncate_dup_scalar_i32_to_i8_v8i8(i32 %val) {
+; CHECK-LABEL: test_truncate_dup_scalar_i32_to_i8_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8b, w0
+; CHECK-NEXT: ret
+ %vec = insertelement <4 x i32> undef, i32 %val, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> undef, <4 x i32> zeroinitializer
+ %trunc = trunc <4 x i32> %dup to <4 x i8>
+ ; Widen to v8i8 to match the test output
+ %result = shufflevector <4 x i8> %trunc, <4 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i8> %result
+}
+
+; Test generalized truncate(dup(scalar_to_vector)) i16 to i8
+define <8 x i8> @test_truncate_dup_scalar_i16_to_i8_v8i8(i16 %val) {
+; CHECK-LABEL: test_truncate_dup_scalar_i16_to_i8_v8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: dup v0.8b, w0
+; CHECK-NEXT: ret
+ %vec = insertelement <8 x i16> undef, i16 %val, i32 0
+ %dup = shufflevector <8 x i16> %vec, <8 x i16> undef, <8 x i32> zeroinitializer
+ %trunc = trunc <8 x i16> %dup to <8 x i8>
+ ret <8 x i8> %trunc
+}
+
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 079ff1076b110..670574f24b4a4 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -32,8 +32,8 @@ entry:
define <2 x i8> @loaddup_v2i8(ptr %p) {
; CHECK-LABEL: loaddup_v2i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: dup v0.2s, w8
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
; CHECK-NEXT: ret
entry:
%a = load i8, ptr %p
@@ -189,8 +189,8 @@ entry:
define <4 x i8> @loaddup_v4i8(ptr %p) {
; CHECK-SD-LABEL: loaddup_v4i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: dup v0.4h, w8
+; CHECK-SD-NEXT: ldr b0, [x0]
+; CHECK-SD-NEXT: dup v0.4h, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v4i8:
@@ -444,8 +444,8 @@ entry:
define <2 x i16> @loaddup_v2i16(ptr %p) {
; CHECK-SD-LABEL: loaddup_v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: dup v0.2s, w8
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: dup v0.2s, v0.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v2i16:
✅ With the latest revision this PR passed the undef deprecator.
Force-pushed from c0d78a3 to 8bb10cf
Force-pushed from 8bb10cf to 9aa805d
Review comment on the new ScalarToVectorExtLoad patterns in llvm/lib/Target/AArch64/AArch64InstrInfo.td:

// naturally zero-extended to i32/i64.
multiclass ScalarToVectorExtLoad<ValueType VecTy, ValueType ScalarTy> {
  def : Pat<(VecTy (scalar_to_vector (ScalarTy (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
            (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
The problem with load patterns is that there are quite a few addressing modes and combinations that we should be supporting but too often do not add patterns for, and the combination of all the types gets a bit out of hand. Some patterns should be considered "canonical", though, that we build the others on top of.
Is there another basic form of loads we can base these on? If you try to use the extload+bitcast patterns we added lately, those are incomplete (and look wrong to me; I'll make a patch). We could also consider scalar_to_vec(extload) as a base form; if so, can you think of a nice templated way to make sure we add all the different addressing forms needed?
Created a templated form which accepts both scalar_to_vector and bitconvert operations.
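A rough sketch of what such a consolidated multiclass could look like; this is illustrative only (the multiclass and parameter names are made up here, only the unsigned-immediate addressing form and the zero-extending loads are shown, and the actual template in the updated revision may differ):

// Illustrative only: a single multiclass parameterized over the wrapping
// operator, so the scalar_to_vector and bitconvert instantiations share the
// same underlying load patterns.
multiclass FPRLoadExtPat<SDPatternOperator WrapOp, ValueType ResTy, ValueType ScalarTy> {
  def : Pat<(ResTy (WrapOp (ScalarTy (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
            (SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
  def : Pat<(ResTy (WrapOp (ScalarTy (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
            (SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
}

// Hypothetical instantiations covering both wrapping forms:
defm : FPRLoadExtPat<scalar_to_vector, v4i32, i32>;
defm : FPRLoadExtPat<bitconvert, f32, i32>;

The remaining addressing modes (register-offset, unscaled, etc.) and the anyext loads would be instantiated the same way.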
Loads the data into the SIMD register, thus sparing a physical register and a potentially costly movement of data. Consolidated into a template which also handles a similar bitconvert pattern.
Force-pushed from 9aa805d to 4b18219