Skip to content

Commit 1034bb5

Browse files
authored
[AArch64] Use SVE to materialise some 128-bit vector constants (#159101)
There is no easy way to materialise some fixed-width vector constants with 64-bit elements. This is because NEON's movi instruction is restricted to setting all bits in a byte to the same value, i.e. 0xFF can be encoded as an immediate but not 0x1F. However, if SVE is available we can use the dup instruction to cover more cases. Rather than lower the immediate directly using the dup instruction, I've instead used the generic SPLAT_VECTOR node in combination with an EXTRACT_SUBVECTOR. This is because we already have SVE splat_vector patterns that can match directly to dup.
1 parent 90db629 commit 1034bb5

File tree

5 files changed

+302
-30
lines changed

5 files changed

+302
-30
lines changed

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 7 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4346,34 +4346,14 @@ bool AArch64DAGToDAGISel::SelectSVECpyDupImm(SDValue N, MVT VT, SDValue &Imm,
43464346
->getAPIntValue()
43474347
.trunc(VT.getFixedSizeInBits())
43484348
.getSExtValue();
4349+
int32_t ImmVal, ShiftVal;
4350+
if (!AArch64_AM::isSVECpyDupImm(VT.getScalarSizeInBits(), Val, ImmVal,
4351+
ShiftVal))
4352+
return false;
43494353

4350-
switch (VT.SimpleTy) {
4351-
case MVT::i8:
4352-
// All immediates are supported.
4353-
Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
4354-
Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32);
4355-
return true;
4356-
case MVT::i16:
4357-
case MVT::i32:
4358-
case MVT::i64:
4359-
// Support 8bit signed immediates.
4360-
if (Val >= -128 && Val <= 127) {
4361-
Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
4362-
Imm = CurDAG->getTargetConstant(Val & 0xFF, DL, MVT::i32);
4363-
return true;
4364-
}
4365-
// Support 16bit signed immediates that are a multiple of 256.
4366-
if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) {
4367-
Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
4368-
Imm = CurDAG->getTargetConstant((Val >> 8) & 0xFF, DL, MVT::i32);
4369-
return true;
4370-
}
4371-
break;
4372-
default:
4373-
break;
4374-
}
4375-
4376-
return false;
4354+
Shift = CurDAG->getTargetConstant(ShiftVal, DL, MVT::i32);
4355+
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
4356+
return true;
43774357
}
43784358

43794359
bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15270,6 +15270,27 @@ static SDValue NormalizeBuildVector(SDValue Op,
1527015270
return DAG.getBuildVector(VT, DL, Ops);
1527115271
}
1527215272

15273+
/// Attempt to build the 128-bit constant described by \p DefBits with an SVE
/// splat of a single 64-bit value.
///
/// The attempt is abandoned (an empty SDValue is returned) unless the
/// subtarget reports SVE, \p Op has a 128-bit vector type, both 64-bit halves
/// of \p DefBits are equal, and the shared 64-bit value is accepted by
/// AArch64_AM::isSVECpyDupImm for 64-bit elements.
///
/// On success the value is splatted into an nxv2i64 vector, converted to the
/// fixed-length v2i64 type and finally NVCAST to the type of \p Op.
static SDValue trySVESplat64(SDValue Op, SelectionDAG &DAG,
                             const AArch64Subtarget *ST, APInt &DefBits) {
  // TODO: We should be able to support 64-bit destinations too
  if (!ST->hasSVE())
    return SDValue();

  EVT VT = Op.getValueType();
  if (!VT.is128BitVector())
    return SDValue();

  // Only a constant that repeats the same 64-bit pattern can be a 64-bit splat.
  if (DefBits.getHiBits(64) != DefBits.getLoBits(64))
    return SDValue();

  // See if we can make use of the SVE dup instruction. The decoded immediate
  // and shift are not used here; only the yes/no answer matters.
  APInt Elt64 = DefBits.trunc(64);
  int32_t UnusedImm, UnusedShift;
  const bool Encodable = AArch64_AM::isSVECpyDupImm(
      64, Elt64.getSExtValue(), UnusedImm, UnusedShift);
  if (!Encodable)
    return SDValue();

  SDLoc DL(Op);
  SDValue Scalar = DAG.getConstant(Elt64, DL, MVT::i64);
  SDValue ScalableSplat = DAG.getSplatVector(MVT::nxv2i64, DL, Scalar);
  SDValue FixedSplat =
      convertFromScalableVector(DAG, MVT::v2i64, ScalableSplat);
  return DAG.getNode(AArch64ISD::NVCAST, DL, VT, FixedSplat);
}
15293+
1527315294
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
1527415295
const AArch64Subtarget *ST) {
1527515296
EVT VT = Op.getValueType();
@@ -15309,6 +15330,10 @@ static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
1530915330
if (SDValue R = TryMOVIWithBits(UndefBits))
1531015331
return R;
1531115332

15333+
// Try to materialise the constant using SVE when available.
15334+
if (SDValue R = trySVESplat64(Op, DAG, ST, DefBits))
15335+
return R;
15336+
1531215337
// See if a fneg of the constant can be materialized with a MOVI, etc
1531315338
auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
1531415339
// FNegate each sub-element of the constant

llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -871,6 +871,36 @@ inline static bool isAnyMOVWMovAlias(uint64_t Value, int RegWidth) {
871871
return isAnyMOVZMovAlias(Value, RegWidth);
872872
}
873873

874+
/// Check whether \p Val can be encoded as the 8-bit immediate (with optional
/// left shift by 8) accepted by this helper for elements of \p SizeInBits
/// bits.
///
/// \param SizeInBits Element size in bits; 8, 16, 32 and 64 are handled, any
///                   other value is rejected.
/// \param Val        Candidate value. The range checks below are signed, so
///                   callers are expected to pass it sign-extended.
/// \param Imm        On success, the 8-bit immediate field (0..255).
/// \param Shift      On success, the shift amount: 0 or 8.
/// \returns true if \p Val is encodable. On failure \p Imm and \p Shift are
///          left untouched.
static inline bool isSVECpyDupImm(int SizeInBits, int64_t Val, int32_t &Imm,
                                  int32_t &Shift) {
  switch (SizeInBits) {
  case 8:
    // All immediates are supported.
    Shift = 0;
    Imm = static_cast<int32_t>(Val & 0xFF);
    return true;
  case 16:
  case 32:
  case 64:
    // Support 8bit signed immediates.
    if (Val >= -128 && Val <= 127) {
      Shift = 0;
      Imm = static_cast<int32_t>(Val & 0xFF);
      return true;
    }
    // Support 16bit signed immediates that are a multiple of 256.
    if (Val >= -32768 && Val <= 32512 && Val % 256 == 0) {
      Shift = 8;
      // Val is an exact multiple of 256 here, so dividing is exact and, unlike
      // right-shifting a negative value, is fully defined before C++20.
      Imm = static_cast<int32_t>((Val / 256) & 0xFF);
      return true;
    }
    break;
  default:
    break;
  }
  return false;
}
903+
874904
} // end namespace AArch64_AM
875905

876906
} // end namespace llvm

llvm/test/CodeGen/AArch64/extract-vector-cmp.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,9 @@ define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
7575
; CHECK-LABEL: vector_loop_with_icmp:
7676
; CHECK: // %bb.0: // %entry
7777
; CHECK-NEXT: index z0.d, #0, #1
78-
; CHECK-NEXT: mov w8, #2 // =0x2
79-
; CHECK-NEXT: mov w9, #16 // =0x10
80-
; CHECK-NEXT: dup v1.2d, x8
78+
; CHECK-NEXT: mov z1.d, #2 // =0x2
8179
; CHECK-NEXT: add x8, x0, #4
80+
; CHECK-NEXT: mov w9, #16 // =0x10
8281
; CHECK-NEXT: mov w10, #1 // =0x1
8382
; CHECK-NEXT: b .LBB5_2
8483
; CHECK-NEXT: .LBB5_1: // %pred.store.continue6
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc -mtriple=aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=COMMON,NEON
3+
; RUN: llc -mtriple=aarch64 -mattr=+neon,+sve < %s | FileCheck %s --check-prefixes=COMMON,SVE
4+
5+
define <2 x i64> @movi_1_v2i64() {
6+
; NEON-LABEL: movi_1_v2i64:
7+
; NEON: // %bb.0:
8+
; NEON-NEXT: mov w8, #1 // =0x1
9+
; NEON-NEXT: dup v0.2d, x8
10+
; NEON-NEXT: ret
11+
;
12+
; SVE-LABEL: movi_1_v2i64:
13+
; SVE: // %bb.0:
14+
; SVE-NEXT: mov z0.d, #1 // =0x1
15+
; SVE-NEXT: ret
16+
ret <2 x i64> splat (i64 1)
17+
}
18+
19+
define <2 x i64> @movi_127_v2i64() {
20+
; NEON-LABEL: movi_127_v2i64:
21+
; NEON: // %bb.0:
22+
; NEON-NEXT: mov w8, #127 // =0x7f
23+
; NEON-NEXT: dup v0.2d, x8
24+
; NEON-NEXT: ret
25+
;
26+
; SVE-LABEL: movi_127_v2i64:
27+
; SVE: // %bb.0:
28+
; SVE-NEXT: mov z0.d, #127 // =0x7f
29+
; SVE-NEXT: ret
30+
ret <2 x i64> splat (i64 127)
31+
}
32+
33+
define <2 x i64> @movi_m128_v2i64() {
34+
; NEON-LABEL: movi_m128_v2i64:
35+
; NEON: // %bb.0:
36+
; NEON-NEXT: mov x8, #-128 // =0xffffffffffffff80
37+
; NEON-NEXT: dup v0.2d, x8
38+
; NEON-NEXT: ret
39+
;
40+
; SVE-LABEL: movi_m128_v2i64:
41+
; SVE: // %bb.0:
42+
; SVE-NEXT: mov z0.d, #-128 // =0xffffffffffffff80
43+
; SVE-NEXT: ret
44+
ret <2 x i64> splat (i64 -128)
45+
}
46+
47+
define <2 x i64> @movi_256_v2i64() {
48+
; NEON-LABEL: movi_256_v2i64:
49+
; NEON: // %bb.0:
50+
; NEON-NEXT: mov w8, #256 // =0x100
51+
; NEON-NEXT: dup v0.2d, x8
52+
; NEON-NEXT: ret
53+
;
54+
; SVE-LABEL: movi_256_v2i64:
55+
; SVE: // %bb.0:
56+
; SVE-NEXT: mov z0.d, #256 // =0x100
57+
; SVE-NEXT: ret
58+
ret <2 x i64> splat (i64 256)
59+
}
60+
61+
define <2 x i64> @movi_32512_v2i64() {
62+
; NEON-LABEL: movi_32512_v2i64:
63+
; NEON: // %bb.0:
64+
; NEON-NEXT: mov w8, #32512 // =0x7f00
65+
; NEON-NEXT: dup v0.2d, x8
66+
; NEON-NEXT: ret
67+
;
68+
; SVE-LABEL: movi_32512_v2i64:
69+
; SVE: // %bb.0:
70+
; SVE-NEXT: mov z0.d, #32512 // =0x7f00
71+
; SVE-NEXT: ret
72+
ret <2 x i64> splat (i64 32512)
73+
}
74+
75+
define <2 x i64> @movi_m32768_v2i64() {
76+
; NEON-LABEL: movi_m32768_v2i64:
77+
; NEON: // %bb.0:
78+
; NEON-NEXT: mov x8, #-32768 // =0xffffffffffff8000
79+
; NEON-NEXT: dup v0.2d, x8
80+
; NEON-NEXT: ret
81+
;
82+
; SVE-LABEL: movi_m32768_v2i64:
83+
; SVE: // %bb.0:
84+
; SVE-NEXT: mov z0.d, #-32768 // =0xffffffffffff8000
85+
; SVE-NEXT: ret
86+
ret <2 x i64> splat (i64 -32768)
87+
}
88+
89+
; Special cases where the destination vector does not have 64-bit elements
90+
91+
define <4 x i32> @movi_v4i32_1() {
92+
; NEON-LABEL: movi_v4i32_1:
93+
; NEON: // %bb.0:
94+
; NEON-NEXT: adrp x8, .LCPI6_0
95+
; NEON-NEXT: ldr q0, [x8, :lo12:.LCPI6_0]
96+
; NEON-NEXT: ret
97+
;
98+
; SVE-LABEL: movi_v4i32_1:
99+
; SVE: // %bb.0:
100+
; SVE-NEXT: mov z0.d, #127 // =0x7f
101+
; SVE-NEXT: ret
102+
ret <4 x i32> <i32 127, i32 0, i32 127, i32 0>
103+
}
104+
105+
define <4 x i32> @movi_v4i32_2() {
106+
; NEON-LABEL: movi_v4i32_2:
107+
; NEON: // %bb.0:
108+
; NEON-NEXT: adrp x8, .LCPI7_0
109+
; NEON-NEXT: ldr q0, [x8, :lo12:.LCPI7_0]
110+
; NEON-NEXT: ret
111+
;
112+
; SVE-LABEL: movi_v4i32_2:
113+
; SVE: // %bb.0:
114+
; SVE-NEXT: mov z0.d, #32512 // =0x7f00
115+
; SVE-NEXT: ret
116+
ret <4 x i32> <i32 32512, i32 0, i32 32512, i32 0>
117+
}
118+
119+
define <8 x i16> @movi_v8i16_1() {
120+
; NEON-LABEL: movi_v8i16_1:
121+
; NEON: // %bb.0:
122+
; NEON-NEXT: adrp x8, .LCPI8_0
123+
; NEON-NEXT: ldr q0, [x8, :lo12:.LCPI8_0]
124+
; NEON-NEXT: ret
125+
;
126+
; SVE-LABEL: movi_v8i16_1:
127+
; SVE: // %bb.0:
128+
; SVE-NEXT: mov z0.d, #127 // =0x7f
129+
; SVE-NEXT: ret
130+
ret <8 x i16> <i16 127, i16 0, i16 0, i16 0, i16 127, i16 0, i16 0, i16 0>
131+
}
132+
133+
define <8 x i16> @movi_v8i16_2() {
134+
; NEON-LABEL: movi_v8i16_2:
135+
; NEON: // %bb.0:
136+
; NEON-NEXT: adrp x8, .LCPI9_0
137+
; NEON-NEXT: ldr q0, [x8, :lo12:.LCPI9_0]
138+
; NEON-NEXT: ret
139+
;
140+
; SVE-LABEL: movi_v8i16_2:
141+
; SVE: // %bb.0:
142+
; SVE-NEXT: mov z0.d, #32512 // =0x7f00
143+
; SVE-NEXT: ret
144+
ret <8 x i16> <i16 32512, i16 0, i16 0, i16 0, i16 32512, i16 0, i16 0, i16 0>
145+
}
146+
147+
define <16 x i8> @movi_v16i8_1() {
148+
; NEON-LABEL: movi_v16i8_1:
149+
; NEON: // %bb.0:
150+
; NEON-NEXT: adrp x8, .LCPI10_0
151+
; NEON-NEXT: ldr q0, [x8, :lo12:.LCPI10_0]
152+
; NEON-NEXT: ret
153+
;
154+
; SVE-LABEL: movi_v16i8_1:
155+
; SVE: // %bb.0:
156+
; SVE-NEXT: mov z0.d, #127 // =0x7f
157+
; SVE-NEXT: ret
158+
ret <16 x i8> <i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
159+
}
160+
161+
define <16 x i8> @movi_v16i8_2() {
162+
; NEON-LABEL: movi_v16i8_2:
163+
; NEON: // %bb.0:
164+
; NEON-NEXT: adrp x8, .LCPI11_0
165+
; NEON-NEXT: ldr q0, [x8, :lo12:.LCPI11_0]
166+
; NEON-NEXT: ret
167+
;
168+
; SVE-LABEL: movi_v16i8_2:
169+
; SVE: // %bb.0:
170+
; SVE-NEXT: mov z0.d, #32512 // =0x7f00
171+
; SVE-NEXT: ret
172+
ret <16 x i8> <i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
173+
}
174+
175+
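; Negative cases: the repeated 64-bit value cannot be encoded as an SVE dup immediate, so the NEON and SVE runs are expected to produce identical code (COMMON prefix).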
176+
177+
define <2 x i64> @movi_128_v2i64() {
178+
; COMMON-LABEL: movi_128_v2i64:
179+
; COMMON: // %bb.0:
180+
; COMMON-NEXT: mov w8, #128 // =0x80
181+
; COMMON-NEXT: dup v0.2d, x8
182+
; COMMON-NEXT: ret
183+
ret <2 x i64> splat (i64 128)
184+
}
185+
186+
; NOTE(review): the function name says "m127" but the value splatted below is -129 (one below the signed 8-bit minimum); the name looks like a typo for "m129".
define <2 x i64> @movi_m127_v2i64() {
187+
; COMMON-LABEL: movi_m127_v2i64:
188+
; COMMON: // %bb.0:
189+
; COMMON-NEXT: mov x8, #-129 // =0xffffffffffffff7f
190+
; COMMON-NEXT: dup v0.2d, x8
191+
; COMMON-NEXT: ret
192+
ret <2 x i64> splat (i64 -129)
193+
}
194+
195+
define <2 x i64> @movi_32513_v2i64() {
196+
; COMMON-LABEL: movi_32513_v2i64:
197+
; COMMON: // %bb.0:
198+
; COMMON-NEXT: mov w8, #32513 // =0x7f01
199+
; COMMON-NEXT: dup v0.2d, x8
200+
; COMMON-NEXT: ret
201+
ret <2 x i64> splat (i64 32513)
202+
}
203+
204+
define <2 x i64> @movi_m32769_v2i64() {
205+
; COMMON-LABEL: movi_m32769_v2i64:
206+
; COMMON: // %bb.0:
207+
; COMMON-NEXT: mov x8, #-32769 // =0xffffffffffff7fff
208+
; COMMON-NEXT: dup v0.2d, x8
209+
; COMMON-NEXT: ret
210+
ret <2 x i64> splat (i64 -32769)
211+
}
212+
213+
define <2 x i64> @movi_257_v2i64() {
214+
; COMMON-LABEL: movi_257_v2i64:
215+
; COMMON: // %bb.0:
216+
; COMMON-NEXT: mov w8, #257 // =0x101
217+
; COMMON-NEXT: dup v0.2d, x8
218+
; COMMON-NEXT: ret
219+
ret <2 x i64> splat (i64 257)
220+
}
221+
222+
define <4 x i32> @movi_v4i32_3() {
223+
; COMMON-LABEL: movi_v4i32_3:
224+
; COMMON: // %bb.0:
225+
; COMMON-NEXT: adrp x8, .LCPI17_0
226+
; COMMON-NEXT: ldr q0, [x8, :lo12:.LCPI17_0]
227+
; COMMON-NEXT: ret
228+
ret <4 x i32> <i32 -128, i32 0, i32 -128, i32 0>
229+
}
230+
231+
define <16 x i8> @movi_v16i8_3() {
232+
; COMMON-LABEL: movi_v16i8_3:
233+
; COMMON: // %bb.0:
234+
; COMMON-NEXT: adrp x8, .LCPI18_0
235+
; COMMON-NEXT: ldr q0, [x8, :lo12:.LCPI18_0]
236+
; COMMON-NEXT: ret
237+
ret <16 x i8> <i8 0, i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 127, i8 0, i8 0, i8 0, i8 0, i8 0>
238+
}

0 commit comments

Comments
 (0)