Skip to content

Commit 9904371

Browse files
guy-david and aokblast
authored and committed
[AArch64] Optimize splat of extending loads to avoid GPR->FPR transfer (llvm#163067)
Loads the data into the SIMD register, thus sparing a physical register and a potentially costly movement of data.
1 parent 0e79964 commit 9904371

File tree

6 files changed

+291
-57
lines changed

6 files changed

+291
-57
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26725,11 +26725,34 @@ static SDValue performDUPCombine(SDNode *N,
2672526725
}
2672626726

2672726727
if (N->getOpcode() == AArch64ISD::DUP) {
26728+
SDValue Op = N->getOperand(0);
26729+
26730+
// Optimize DUP(extload/zextload i8/i16/i32) to avoid GPR->FPR transfer.
26731+
// For example:
26732+
// v4i32 = DUP (i32 (zextloadi8 addr))
26733+
// =>
26734+
// v4i32 = SCALAR_TO_VECTOR (i32 (zextloadi8 addr)) ; Matches to ldr b0
26735+
// v4i32 = DUPLANE32 (v4i32), 0
26736+
if (auto *LD = dyn_cast<LoadSDNode>(Op)) {
26737+
ISD::LoadExtType ExtType = LD->getExtensionType();
26738+
EVT MemVT = LD->getMemoryVT();
26739+
EVT ElemVT = VT.getVectorElementType();
26740+
if ((ExtType == ISD::EXTLOAD || ExtType == ISD::ZEXTLOAD) &&
26741+
(MemVT == MVT::i8 || MemVT == MVT::i16 || MemVT == MVT::i32) &&
26742+
ElemVT != MemVT && LD->hasOneUse()) {
26743+
EVT Vec128VT = EVT::getVectorVT(*DCI.DAG.getContext(), ElemVT,
26744+
128 / ElemVT.getSizeInBits());
26745+
SDValue ScalarToVec =
26746+
DCI.DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, Vec128VT, Op);
26747+
return DCI.DAG.getNode(getDUPLANEOp(ElemVT), DL, VT, ScalarToVec,
26748+
DCI.DAG.getConstant(0, DL, MVT::i64));
26749+
}
26750+
}
26751+
2672826752
// If the instruction is known to produce a scalar in SIMD registers, we can
2672926753
// duplicate it across the vector lanes using DUPLANE instead of moving it
2673026754
// to a GPR first. For example, this allows us to handle:
2673126755
// v4i32 = DUP (i32 (FCMGT (f32, f32)))
26732-
SDValue Op = N->getOperand(0);
2673326756
// FIXME: Ideally, we should be able to handle all instructions that
2673426757
// produce a scalar value in FPRs.
2673526758
if (Op.getOpcode() == AArch64ISD::FCMEQ ||

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 58 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4022,22 +4022,6 @@ defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
40224022
def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
40234023
(SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
40244024

4025-
// load zero-extended i32, bitcast to f64
4026-
def : Pat<(f64 (bitconvert (i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
4027-
(SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
4028-
// load zero-extended i16, bitcast to f64
4029-
def : Pat<(f64 (bitconvert (i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
4030-
(SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
4031-
// load zero-extended i8, bitcast to f64
4032-
def : Pat<(f64 (bitconvert (i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
4033-
(SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
4034-
// load zero-extended i16, bitcast to f32
4035-
def : Pat<(f32 (bitconvert (i32 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
4036-
(SUBREG_TO_REG (i32 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
4037-
// load zero-extended i8, bitcast to f32
4038-
def : Pat<(f32 (bitconvert (i32 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
4039-
(SUBREG_TO_REG (i32 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
4040-
40414025
// Pre-fetch.
40424026
def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
40434027
[(AArch64Prefetch timm:$Rt,
@@ -4389,6 +4373,64 @@ def : Pat <(v1i64 (scalar_to_vector (i64
43894373
(load (ro64.Xpat GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend))))),
43904374
(LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro64.Xext:$extend)>;
43914375

4376+
// Patterns for bitconvert or scalar_to_vector of load operations.
4377+
// Enables direct SIMD register loads for small integer types (i8/i16) that are
4378+
// naturally zero-extended to i32/i64.
4379+
multiclass ExtLoad8_16AllModes<ValueType OutTy, ValueType InnerTy,
4380+
SDPatternOperator OuterOp,
4381+
PatFrags LoadOp8, PatFrags LoadOp16> {
4382+
// 8-bit loads.
4383+
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
4384+
(SUBREG_TO_REG (i64 0), (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
4385+
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
4386+
(SUBREG_TO_REG (i64 0), (LDURBi GPR64sp:$Rn, simm9:$offset), bsub)>;
4387+
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend))))),
4388+
(SUBREG_TO_REG (i64 0), (LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$extend), bsub)>;
4389+
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp8 (ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend))))),
4390+
(SUBREG_TO_REG (i64 0), (LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$extend), bsub)>;
4391+
4392+
// 16-bit loads.
4393+
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
4394+
(SUBREG_TO_REG (i64 0), (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
4395+
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
4396+
(SUBREG_TO_REG (i64 0), (LDURHi GPR64sp:$Rn, simm9:$offset), hsub)>;
4397+
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend))))),
4398+
(SUBREG_TO_REG (i64 0), (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$extend), hsub)>;
4399+
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp16 (ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend))))),
4400+
(SUBREG_TO_REG (i64 0), (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$extend), hsub)>;
4401+
}
4402+
4403+
// Extended multiclass that includes 32-bit loads in addition to 8-bit and 16-bit.
4404+
multiclass ExtLoad8_16_32AllModes<ValueType OutTy, ValueType InnerTy,
4405+
SDPatternOperator OuterOp,
4406+
PatFrags LoadOp8, PatFrags LoadOp16, PatFrags LoadOp32> {
4407+
defm : ExtLoad8_16AllModes<OutTy, InnerTy, OuterOp, LoadOp8, LoadOp16>;
4408+
4409+
// 32-bit loads.
4410+
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
4411+
(SUBREG_TO_REG (i64 0), (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
4412+
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
4413+
(SUBREG_TO_REG (i64 0), (LDURSi GPR64sp:$Rn, simm9:$offset), ssub)>;
4414+
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend))))),
4415+
(SUBREG_TO_REG (i64 0), (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$extend), ssub)>;
4416+
def : Pat<(OutTy (OuterOp (InnerTy (LoadOp32 (ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend))))),
4417+
(SUBREG_TO_REG (i64 0), (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$extend), ssub)>;
4418+
}
4419+
4420+
// Instantiate bitconvert patterns for floating-point types.
4421+
defm : ExtLoad8_16AllModes<f32, i32, bitconvert, zextloadi8, zextloadi16>;
4422+
defm : ExtLoad8_16_32AllModes<f64, i64, bitconvert, zextloadi8, zextloadi16, zextloadi32>;
4423+
4424+
// Instantiate scalar_to_vector patterns for all vector types.
4425+
defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, zextloadi8, zextloadi16>;
4426+
defm : ExtLoad8_16AllModes<v16i8, i32, scalar_to_vector, extloadi8, extloadi16>;
4427+
defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, zextloadi8, zextloadi16>;
4428+
defm : ExtLoad8_16AllModes<v8i16, i32, scalar_to_vector, extloadi8, extloadi16>;
4429+
defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, zextloadi8, zextloadi16>;
4430+
defm : ExtLoad8_16AllModes<v4i32, i32, scalar_to_vector, extloadi8, extloadi16>;
4431+
defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, zextloadi8, zextloadi16, zextloadi32>;
4432+
defm : ExtLoad8_16_32AllModes<v2i64, i64, scalar_to_vector, extloadi8, extloadi16, extloadi32>;
4433+
43924434
// Pre-fetch.
43934435
defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
43944436
[(AArch64Prefetch timm:$Rt,

llvm/test/CodeGen/AArch64/aarch64-smull.ll

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -222,22 +222,20 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
222222
define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
223223
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
224224
; CHECK-NEON: // %bb.0:
225-
; CHECK-NEON-NEXT: ldrh w8, [x0]
226-
; CHECK-NEON-NEXT: ldrh w9, [x0, #2]
225+
; CHECK-NEON-NEXT: ldrh w8, [x0, #2]
226+
; CHECK-NEON-NEXT: ldr h0, [x0]
227227
; CHECK-NEON-NEXT: ldr d1, [x1]
228-
; CHECK-NEON-NEXT: fmov d0, x8
229-
; CHECK-NEON-NEXT: mov v0.d[1], x9
228+
; CHECK-NEON-NEXT: mov v0.d[1], x8
230229
; CHECK-NEON-NEXT: xtn v0.2s, v0.2d
231230
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
232231
; CHECK-NEON-NEXT: ret
233232
;
234233
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
235234
; CHECK-SVE: // %bb.0:
236-
; CHECK-SVE-NEXT: ldrh w8, [x0]
237-
; CHECK-SVE-NEXT: ldrh w9, [x0, #2]
235+
; CHECK-SVE-NEXT: ldrh w8, [x0, #2]
236+
; CHECK-SVE-NEXT: ldr h0, [x0]
238237
; CHECK-SVE-NEXT: ldr d1, [x1]
239-
; CHECK-SVE-NEXT: fmov d0, x8
240-
; CHECK-SVE-NEXT: mov v0.d[1], x9
238+
; CHECK-SVE-NEXT: mov v0.d[1], x8
241239
; CHECK-SVE-NEXT: xtn v0.2s, v0.2d
242240
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
243241
; CHECK-SVE-NEXT: ret
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
3+
4+
; Test optimization of DUP with extended narrow loads
5+
; This should avoid GPR->SIMD transfers by loading directly into vector registers
6+
7+
define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
8+
; CHECK-LABEL: test_dup_zextload_i8_v4i16:
9+
; CHECK: // %bb.0:
10+
; CHECK-NEXT: ldr b0, [x0]
11+
; CHECK-NEXT: dup v0.4h, v0.h[0]
12+
; CHECK-NEXT: ret
13+
%load = load i8, ptr %p, align 1
14+
%ext = zext i8 %load to i16
15+
%vec = insertelement <4 x i16> poison, i16 %ext, i32 0
16+
%dup = shufflevector <4 x i16> %vec, <4 x i16> poison, <4 x i32> zeroinitializer
17+
ret <4 x i16> %dup
18+
}
19+
20+
define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
21+
; CHECK-LABEL: test_dup_zextload_i8_v8i16:
22+
; CHECK: // %bb.0:
23+
; CHECK-NEXT: ldr b0, [x0]
24+
; CHECK-NEXT: dup v0.8h, v0.h[0]
25+
; CHECK-NEXT: ret
26+
%load = load i8, ptr %p, align 1
27+
%ext = zext i8 %load to i16
28+
%vec = insertelement <8 x i16> poison, i16 %ext, i32 0
29+
%dup = shufflevector <8 x i16> %vec, <8 x i16> poison, <8 x i32> zeroinitializer
30+
ret <8 x i16> %dup
31+
}
32+
33+
define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
34+
; CHECK-LABEL: test_dup_zextload_i8_v2i32:
35+
; CHECK: // %bb.0:
36+
; CHECK-NEXT: ldr b0, [x0]
37+
; CHECK-NEXT: dup v0.2s, v0.s[0]
38+
; CHECK-NEXT: ret
39+
%load = load i8, ptr %p, align 1
40+
%ext = zext i8 %load to i32
41+
%vec = insertelement <2 x i32> poison, i32 %ext, i32 0
42+
%dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
43+
ret <2 x i32> %dup
44+
}
45+
46+
define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
47+
; CHECK-LABEL: test_dup_zextload_i8_v4i32:
48+
; CHECK: // %bb.0:
49+
; CHECK-NEXT: ldr b0, [x0]
50+
; CHECK-NEXT: dup v0.4s, v0.s[0]
51+
; CHECK-NEXT: ret
52+
%load = load i8, ptr %p, align 1
53+
%ext = zext i8 %load to i32
54+
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
55+
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
56+
ret <4 x i32> %dup
57+
}
58+
59+
define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
60+
; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset:
61+
; CHECK: // %bb.0:
62+
; CHECK-NEXT: ldr b0, [x0, #4]
63+
; CHECK-NEXT: dup v0.4s, v0.s[0]
64+
; CHECK-NEXT: ret
65+
%addr = getelementptr inbounds i8, ptr %p, i64 4
66+
%load = load i8, ptr %addr, align 1
67+
%ext = zext i8 %load to i32
68+
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
69+
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
70+
ret <4 x i32> %dup
71+
}
72+
73+
define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
74+
; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
75+
; CHECK: // %bb.0:
76+
; CHECK-NEXT: ldr b0, [x0, x1]
77+
; CHECK-NEXT: dup v0.4s, v0.s[0]
78+
; CHECK-NEXT: ret
79+
%addr = getelementptr inbounds i8, ptr %p, i64 %offset
80+
%load = load i8, ptr %addr, align 1
81+
%ext = zext i8 %load to i32
82+
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
83+
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
84+
ret <4 x i32> %dup
85+
}
86+
87+
define <2 x i64> @test_dup_zextload_i8_v2i64(ptr %p) {
88+
; CHECK-LABEL: test_dup_zextload_i8_v2i64:
89+
; CHECK: // %bb.0:
90+
; CHECK-NEXT: ldr b0, [x0]
91+
; CHECK-NEXT: dup v0.2d, v0.d[0]
92+
; CHECK-NEXT: ret
93+
%load = load i8, ptr %p, align 1
94+
%ext = zext i8 %load to i64
95+
%vec = insertelement <2 x i64> poison, i64 %ext, i32 0
96+
%dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
97+
ret <2 x i64> %dup
98+
}
99+
100+
define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
101+
; CHECK-LABEL: test_dup_zextload_i16_v2i32:
102+
; CHECK: // %bb.0:
103+
; CHECK-NEXT: ldr h0, [x0]
104+
; CHECK-NEXT: dup v0.2s, v0.s[0]
105+
; CHECK-NEXT: ret
106+
%load = load i16, ptr %p, align 1
107+
%ext = zext i16 %load to i32
108+
%vec = insertelement <2 x i32> poison, i32 %ext, i32 0
109+
%dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
110+
ret <2 x i32> %dup
111+
}
112+
113+
define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
114+
; CHECK-LABEL: test_dup_zextload_i16_v4i32:
115+
; CHECK: // %bb.0:
116+
; CHECK-NEXT: ldr h0, [x0]
117+
; CHECK-NEXT: dup v0.4s, v0.s[0]
118+
; CHECK-NEXT: ret
119+
%load = load i16, ptr %p, align 1
120+
%ext = zext i16 %load to i32
121+
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
122+
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
123+
ret <4 x i32> %dup
124+
}
125+
126+
define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
127+
; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
128+
; CHECK: // %bb.0:
129+
; CHECK-NEXT: ldr h0, [x0, #8]
130+
; CHECK-NEXT: dup v0.4s, v0.s[0]
131+
; CHECK-NEXT: ret
132+
%addr = getelementptr inbounds i16, ptr %p, i64 4
133+
%load = load i16, ptr %addr, align 1
134+
%ext = zext i16 %load to i32
135+
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
136+
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
137+
ret <4 x i32> %dup
138+
}
139+
140+
define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
141+
; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset:
142+
; CHECK: // %bb.0:
143+
; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
144+
; CHECK-NEXT: dup v0.4s, v0.s[0]
145+
; CHECK-NEXT: ret
146+
%addr = getelementptr inbounds i16, ptr %p, i64 %offset
147+
%load = load i16, ptr %addr, align 1
148+
%ext = zext i16 %load to i32
149+
%vec = insertelement <4 x i32> poison, i32 %ext, i32 0
150+
%dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
151+
ret <4 x i32> %dup
152+
}
153+
154+
define <2 x i64> @test_dup_zextload_i16_v2i64(ptr %p) {
155+
; CHECK-LABEL: test_dup_zextload_i16_v2i64:
156+
; CHECK: // %bb.0:
157+
; CHECK-NEXT: ldr h0, [x0]
158+
; CHECK-NEXT: dup v0.2d, v0.d[0]
159+
; CHECK-NEXT: ret
160+
%load = load i16, ptr %p, align 1
161+
%ext = zext i16 %load to i64
162+
%vec = insertelement <2 x i64> poison, i64 %ext, i32 0
163+
%dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
164+
ret <2 x i64> %dup
165+
}
166+
167+
define <2 x i64> @test_dup_zextload_i32_v2i64(ptr %p) {
168+
; CHECK-LABEL: test_dup_zextload_i32_v2i64:
169+
; CHECK: // %bb.0:
170+
; CHECK-NEXT: ldr s0, [x0]
171+
; CHECK-NEXT: dup v0.2d, v0.d[0]
172+
; CHECK-NEXT: ret
173+
%load = load i32, ptr %p, align 1
174+
%ext = zext i32 %load to i64
175+
%vec = insertelement <2 x i64> poison, i64 %ext, i32 0
176+
%dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
177+
ret <2 x i64> %dup
178+
}

llvm/test/CodeGen/AArch64/dup.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,8 @@ entry:
3232
define <2 x i8> @loaddup_v2i8(ptr %p) {
3333
; CHECK-LABEL: loaddup_v2i8:
3434
; CHECK: // %bb.0: // %entry
35-
; CHECK-NEXT: ldrb w8, [x0]
36-
; CHECK-NEXT: dup v0.2s, w8
35+
; CHECK-NEXT: ldr b0, [x0]
36+
; CHECK-NEXT: dup v0.2s, v0.s[0]
3737
; CHECK-NEXT: ret
3838
entry:
3939
%a = load i8, ptr %p
@@ -189,8 +189,8 @@ entry:
189189
define <4 x i8> @loaddup_v4i8(ptr %p) {
190190
; CHECK-SD-LABEL: loaddup_v4i8:
191191
; CHECK-SD: // %bb.0: // %entry
192-
; CHECK-SD-NEXT: ldrb w8, [x0]
193-
; CHECK-SD-NEXT: dup v0.4h, w8
192+
; CHECK-SD-NEXT: ldr b0, [x0]
193+
; CHECK-SD-NEXT: dup v0.4h, v0.h[0]
194194
; CHECK-SD-NEXT: ret
195195
;
196196
; CHECK-GI-LABEL: loaddup_v4i8:
@@ -444,8 +444,8 @@ entry:
444444
define <2 x i16> @loaddup_v2i16(ptr %p) {
445445
; CHECK-SD-LABEL: loaddup_v2i16:
446446
; CHECK-SD: // %bb.0: // %entry
447-
; CHECK-SD-NEXT: ldrh w8, [x0]
448-
; CHECK-SD-NEXT: dup v0.2s, w8
447+
; CHECK-SD-NEXT: ldr h0, [x0]
448+
; CHECK-SD-NEXT: dup v0.2s, v0.s[0]
449449
; CHECK-SD-NEXT: ret
450450
;
451451
; CHECK-GI-LABEL: loaddup_v2i16:

0 commit comments

Comments
 (0)