Skip to content

Commit 275dc7d

Browse files
hazzlim and github-actions[bot]
authored and committed
Automerge: [AArch64] Transform add(x, abs(y)) -> saba(x, y, 0) (#156615)
Add a DAGCombine to perform the following transformations:
- add(x, abs(y)) -> saba(x, y, 0)
- add(x, zext(abs(y))) -> sabal(x, y, 0)

As well as being a useful generic transformation, this also fixes an issue where LLVM de-optimises [US]ABA Neon ACLE intrinsics into separate ABD+ADD instructions when one of the operands is a zero vector.
2 parents b7281ce + e38392b commit 275dc7d

File tree

3 files changed

+309
-85
lines changed

3 files changed

+309
-85
lines changed

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8323,6 +8323,29 @@ def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
83238323
(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
83248324
}
83258325

8326+
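// SABA patterns for add(x, abs(y)) -> saba(x, y, 0).
// abs(y) is the absolute difference between y and zero, so an all-zero vector
// built with MOVIv2d_ns is passed as SABA's third operand. The 64-bit (V64)
// forms extract the dsub subregister of that zero vector.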
8327+
def : Pat<(v8i8 (add V64:$Vn, (abs V64:$Vm))),
8328+
(SABAv8i8 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
8329+
def : Pat<(v4i16 (add V64:$Vn, (abs V64:$Vm))),
8330+
(SABAv4i16 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
8331+
def : Pat<(v2i32 (add V64:$Vn, (abs V64:$Vm))),
8332+
(SABAv2i32 V64:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
8333+
def : Pat<(v16i8 (add V128:$Vn, (abs V128:$Vm))),
8334+
(SABAv16i8 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>;
8335+
def : Pat<(v8i16 (add V128:$Vn, (abs V128:$Vm))),
8336+
(SABAv8i16 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>;
8337+
def : Pat<(v4i32 (add V128:$Vn, (abs V128:$Vm))),
8338+
(SABAv4i32 V128:$Vn, V128:$Vm, (MOVIv2d_ns (i32 0)))>;
8339+
8340+
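// SABAL patterns for add(x, zext(abs(y))) -> sabal(x, y, 0).
// The accumulator $Vn is V128 while the narrow source $Vm is V64, so the zero
// operand is the dsub subregister of an all-zero MOVIv2d_ns vector.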
8341+
def : Pat<(v8i16 (add V128:$Vn, (zext (abs (v8i8 V64:$Vm))))),
8342+
(SABALv8i8_v8i16 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
8343+
def : Pat<(v4i32 (add V128:$Vn, (zext (abs (v4i16 V64:$Vm))))),
8344+
(SABALv4i16_v4i32 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
8345+
def : Pat<(v2i64 (add V128:$Vn, (zext (abs (v2i32 V64:$Vm))))),
8346+
(SABALv2i32_v2i64 V128:$Vn, V64:$Vm, (EXTRACT_SUBREG (MOVIv2d_ns (i32 0)), dsub))>;
8347+
8348+
83268349
//----------------------------------------------------------------------------
83278350
// AdvSIMD indexed element
83288351
//----------------------------------------------------------------------------

llvm/test/CodeGen/AArch64/neon-saba.ll

Lines changed: 220 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@ define <4 x i32> @saba_abs_4s(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) #0 {
1212
;
1313
; CHECK-GI-LABEL: saba_abs_4s:
1414
; CHECK-GI: // %bb.0:
15+
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
1516
; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s
16-
; CHECK-GI-NEXT: abs v1.4s, v1.4s
17-
; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s
17+
; CHECK-GI-NEXT: saba v0.4s, v1.4s, v3.4s
1818
; CHECK-GI-NEXT: ret
1919
%sub = sub nsw <4 x i32> %b, %c
2020
%abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true)
@@ -30,9 +30,9 @@ define <2 x i32> @saba_abs_2s(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) #0 {
3030
;
3131
; CHECK-GI-LABEL: saba_abs_2s:
3232
; CHECK-GI: // %bb.0:
33+
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
3334
; CHECK-GI-NEXT: sub v1.2s, v1.2s, v2.2s
34-
; CHECK-GI-NEXT: abs v1.2s, v1.2s
35-
; CHECK-GI-NEXT: add v0.2s, v0.2s, v1.2s
35+
; CHECK-GI-NEXT: saba v0.2s, v1.2s, v3.2s
3636
; CHECK-GI-NEXT: ret
3737
%sub = sub nsw <2 x i32> %b, %c
3838
%abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %sub, i1 true)
@@ -48,9 +48,9 @@ define <8 x i16> @saba_abs_8h(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) #0 {
4848
;
4949
; CHECK-GI-LABEL: saba_abs_8h:
5050
; CHECK-GI: // %bb.0:
51+
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
5152
; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h
52-
; CHECK-GI-NEXT: abs v1.8h, v1.8h
53-
; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h
53+
; CHECK-GI-NEXT: saba v0.8h, v1.8h, v3.8h
5454
; CHECK-GI-NEXT: ret
5555
%sub = sub nsw <8 x i16> %b, %c
5656
%abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true)
@@ -66,9 +66,9 @@ define <4 x i16> @saba_abs_4h(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) #0 {
6666
;
6767
; CHECK-GI-LABEL: saba_abs_4h:
6868
; CHECK-GI: // %bb.0:
69+
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
6970
; CHECK-GI-NEXT: sub v1.4h, v1.4h, v2.4h
70-
; CHECK-GI-NEXT: abs v1.4h, v1.4h
71-
; CHECK-GI-NEXT: add v0.4h, v0.4h, v1.4h
71+
; CHECK-GI-NEXT: saba v0.4h, v1.4h, v3.4h
7272
; CHECK-GI-NEXT: ret
7373
%sub = sub nsw <4 x i16> %b, %c
7474
%abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %sub, i1 true)
@@ -84,9 +84,9 @@ define <16 x i8> @saba_abs_16b(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) #0 {
8484
;
8585
; CHECK-GI-LABEL: saba_abs_16b:
8686
; CHECK-GI: // %bb.0:
87+
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
8788
; CHECK-GI-NEXT: sub v1.16b, v1.16b, v2.16b
88-
; CHECK-GI-NEXT: abs v1.16b, v1.16b
89-
; CHECK-GI-NEXT: add v0.16b, v0.16b, v1.16b
89+
; CHECK-GI-NEXT: saba v0.16b, v1.16b, v3.16b
9090
; CHECK-GI-NEXT: ret
9191
%sub = sub nsw <16 x i8> %b, %c
9292
%abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 true)
@@ -102,9 +102,9 @@ define <8 x i8> @saba_abs_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
102102
;
103103
; CHECK-GI-LABEL: saba_abs_8b:
104104
; CHECK-GI: // %bb.0:
105+
; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
105106
; CHECK-GI-NEXT: sub v1.8b, v1.8b, v2.8b
106-
; CHECK-GI-NEXT: abs v1.8b, v1.8b
107-
; CHECK-GI-NEXT: add v0.8b, v0.8b, v1.8b
107+
; CHECK-GI-NEXT: saba v0.8b, v1.8b, v3.8b
108108
; CHECK-GI-NEXT: ret
109109
%sub = sub nsw <8 x i8> %b, %c
110110
%abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %sub, i1 true)
@@ -174,6 +174,214 @@ define <8 x i8> @saba_sabd_8b(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) #0 {
174174
ret <8 x i8> %add
175175
}
176176

177+
; SABA from ADD(SABD(X, ZEROS)) and from ADD(X, ABS(Y))
178+
179+
define <4 x i32> @saba_sabd_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 {
180+
; CHECK-LABEL: saba_sabd_zeros_4s:
181+
; CHECK: // %bb.0:
182+
; CHECK-NEXT: movi v2.2d, #0000000000000000
183+
; CHECK-NEXT: saba v0.4s, v1.4s, v2.4s
184+
; CHECK-NEXT: ret
185+
%sabd = call <4 x i32> @llvm.aarch64.neon.sabd.v4i32(<4 x i32> %b, <4 x i32> zeroinitializer)
186+
%add = add <4 x i32> %sabd, %a
187+
ret <4 x i32> %add
188+
}
189+
190+
define <2 x i32> @saba_sabd_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 {
191+
; CHECK-LABEL: saba_sabd_zeros_2s:
192+
; CHECK: // %bb.0:
193+
; CHECK-NEXT: movi v2.2d, #0000000000000000
194+
; CHECK-NEXT: saba v0.2s, v1.2s, v2.2s
195+
; CHECK-NEXT: ret
196+
%sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer)
197+
%add = add <2 x i32> %sabd, %a
198+
ret <2 x i32> %add
199+
}
200+
201+
define <8 x i16> @saba_sabd_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 {
202+
; CHECK-LABEL: saba_sabd_zeros_8h:
203+
; CHECK: // %bb.0:
204+
; CHECK-NEXT: movi v2.2d, #0000000000000000
205+
; CHECK-NEXT: saba v0.8h, v1.8h, v2.8h
206+
; CHECK-NEXT: ret
207+
%sabd = call <8 x i16> @llvm.aarch64.neon.sabd.v8i16(<8 x i16> %b, <8 x i16> zeroinitializer)
208+
%add = add <8 x i16> %sabd, %a
209+
ret <8 x i16> %add
210+
}
211+
212+
define <4 x i16> @saba_sabd_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 {
213+
; CHECK-LABEL: saba_sabd_zeros_4h:
214+
; CHECK: // %bb.0:
215+
; CHECK-NEXT: movi v2.2d, #0000000000000000
216+
; CHECK-NEXT: saba v0.4h, v1.4h, v2.4h
217+
; CHECK-NEXT: ret
218+
%sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer)
219+
%add = add <4 x i16> %sabd, %a
220+
ret <4 x i16> %add
221+
}
222+
223+
define <16 x i8> @saba_sabd_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 {
224+
; CHECK-LABEL: saba_sabd_zeros_16b:
225+
; CHECK: // %bb.0:
226+
; CHECK-NEXT: movi v2.2d, #0000000000000000
227+
; CHECK-NEXT: saba v0.16b, v1.16b, v2.16b
228+
; CHECK-NEXT: ret
229+
%sabd = call <16 x i8> @llvm.aarch64.neon.sabd.v16i8(<16 x i8> %b, <16 x i8> zeroinitializer)
230+
%add = add <16 x i8> %sabd, %a
231+
ret <16 x i8> %add
232+
}
233+
234+
define <8 x i8> @saba_sabd_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 {
235+
; CHECK-LABEL: saba_sabd_zeros_8b:
236+
; CHECK: // %bb.0:
237+
; CHECK-NEXT: movi v2.2d, #0000000000000000
238+
; CHECK-NEXT: saba v0.8b, v1.8b, v2.8b
239+
; CHECK-NEXT: ret
240+
%sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> zeroinitializer)
241+
%add = add <8 x i8> %sabd, %a
242+
ret <8 x i8> %add
243+
}
244+
245+
define <4 x i32> @saba_abs_zeros_4s(<4 x i32> %a, <4 x i32> %b) #0 {
246+
; CHECK-LABEL: saba_abs_zeros_4s:
247+
; CHECK: // %bb.0:
248+
; CHECK-NEXT: movi v2.2d, #0000000000000000
249+
; CHECK-NEXT: saba v0.4s, v1.4s, v2.4s
250+
; CHECK-NEXT: ret
251+
%abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %b, i1 true)
252+
%add = add <4 x i32> %a, %abs
253+
ret <4 x i32> %add
254+
}
255+
256+
define <2 x i32> @saba_abs_zeros_2s(<2 x i32> %a, <2 x i32> %b) #0 {
257+
; CHECK-LABEL: saba_abs_zeros_2s:
258+
; CHECK: // %bb.0:
259+
; CHECK-NEXT: movi v2.2d, #0000000000000000
260+
; CHECK-NEXT: saba v0.2s, v1.2s, v2.2s
261+
; CHECK-NEXT: ret
262+
%abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %b, i1 true)
263+
%add = add <2 x i32> %a, %abs
264+
ret <2 x i32> %add
265+
}
266+
267+
define <8 x i16> @saba_abs_zeros_8h(<8 x i16> %a, <8 x i16> %b) #0 {
268+
; CHECK-LABEL: saba_abs_zeros_8h:
269+
; CHECK: // %bb.0:
270+
; CHECK-NEXT: movi v2.2d, #0000000000000000
271+
; CHECK-NEXT: saba v0.8h, v1.8h, v2.8h
272+
; CHECK-NEXT: ret
273+
%abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %b, i1 true)
274+
%add = add <8 x i16> %a, %abs
275+
ret <8 x i16> %add
276+
}
277+
278+
define <4 x i16> @saba_abs_zeros_4h(<4 x i16> %a, <4 x i16> %b) #0 {
279+
; CHECK-LABEL: saba_abs_zeros_4h:
280+
; CHECK: // %bb.0:
281+
; CHECK-NEXT: movi v2.2d, #0000000000000000
282+
; CHECK-NEXT: saba v0.4h, v1.4h, v2.4h
283+
; CHECK-NEXT: ret
284+
%abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true)
285+
%add = add <4 x i16> %a, %abs
286+
ret <4 x i16> %add
287+
}
288+
289+
define <16 x i8> @saba_abs_zeros_16b(<16 x i8> %a, <16 x i8> %b) #0 {
290+
; CHECK-LABEL: saba_abs_zeros_16b:
291+
; CHECK: // %bb.0:
292+
; CHECK-NEXT: movi v2.2d, #0000000000000000
293+
; CHECK-NEXT: saba v0.16b, v1.16b, v2.16b
294+
; CHECK-NEXT: ret
295+
%abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %b, i1 true)
296+
%add = add <16 x i8> %a, %abs
297+
ret <16 x i8> %add
298+
}
299+
300+
define <8 x i8> @saba_abs_zeros_8b(<8 x i8> %a, <8 x i8> %b) #0 {
301+
; CHECK-LABEL: saba_abs_zeros_8b:
302+
; CHECK: // %bb.0:
303+
; CHECK-NEXT: movi v2.2d, #0000000000000000
304+
; CHECK-NEXT: saba v0.8b, v1.8b, v2.8b
305+
; CHECK-NEXT: ret
306+
%abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true)
307+
%add = add <8 x i8> %a, %abs
308+
ret <8 x i8> %add
309+
}
310+
311+
; SABAL from ADD(ZEXT(SABD(X, ZEROS))) and from ADD(X, ZEXT(ABS(Y)))
312+
313+
define <2 x i64> @sabal_sabd_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 {
314+
; CHECK-LABEL: sabal_sabd_zeros_2s:
315+
; CHECK: // %bb.0:
316+
; CHECK-NEXT: movi v2.2d, #0000000000000000
317+
; CHECK-NEXT: sabal v0.2d, v1.2s, v2.2s
318+
; CHECK-NEXT: ret
319+
%sabd = call <2 x i32> @llvm.aarch64.neon.sabd.v2i32(<2 x i32> %b, <2 x i32> zeroinitializer)
320+
%sabd.zext = zext <2 x i32> %sabd to <2 x i64>
321+
%add = add <2 x i64> %sabd.zext, %a
322+
ret <2 x i64> %add
323+
}
324+
325+
define <4 x i32> @sabal_sabd_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 {
326+
; CHECK-LABEL: sabal_sabd_zeros_4h:
327+
; CHECK: // %bb.0:
328+
; CHECK-NEXT: movi v2.2d, #0000000000000000
329+
; CHECK-NEXT: sabal v0.4s, v1.4h, v2.4h
330+
; CHECK-NEXT: ret
331+
%sabd = call <4 x i16> @llvm.aarch64.neon.sabd.v4i16(<4 x i16> %b, <4 x i16> zeroinitializer)
332+
%sabd.zext = zext <4 x i16> %sabd to <4 x i32>
333+
%add = add <4 x i32> %sabd.zext, %a
334+
ret <4 x i32> %add
335+
}
336+
337+
define <8 x i16> @sabal_sabd_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 {
338+
; CHECK-LABEL: sabal_sabd_zeros_8b:
339+
; CHECK: // %bb.0:
340+
; CHECK-NEXT: movi v2.2d, #0000000000000000
341+
; CHECK-NEXT: sabal v0.8h, v1.8b, v2.8b
342+
; CHECK-NEXT: ret
343+
%sabd = call <8 x i8> @llvm.aarch64.neon.sabd.v8i8(<8 x i8> %b, <8 x i8> zeroinitializer)
344+
%sabd.zext = zext <8 x i8> %sabd to <8 x i16>
345+
%add = add <8 x i16> %sabd.zext, %a
346+
ret <8 x i16> %add
347+
}
348+
349+
define <2 x i64> @sabal_abs_zeros_2s(<2 x i64> %a, <2 x i32> %b) #0 {
350+
; CHECK-LABEL: sabal_abs_zeros_2s:
351+
; CHECK: // %bb.0:
352+
; CHECK-NEXT: movi v2.2d, #0000000000000000
353+
; CHECK-NEXT: sabal v0.2d, v1.2s, v2.2s
354+
; CHECK-NEXT: ret
355+
%abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %b, i1 true)
356+
%abs.zext = zext <2 x i32> %abs to <2 x i64>
357+
%add = add <2 x i64> %a, %abs.zext
358+
ret <2 x i64> %add
359+
}
360+
361+
define <4 x i32> @sabal_abs_zeros_4h(<4 x i32> %a, <4 x i16> %b) #0 {
362+
; CHECK-LABEL: sabal_abs_zeros_4h:
363+
; CHECK: // %bb.0:
364+
; CHECK-NEXT: movi v2.2d, #0000000000000000
365+
; CHECK-NEXT: sabal v0.4s, v1.4h, v2.4h
366+
; CHECK-NEXT: ret
367+
%abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %b, i1 true)
368+
%abs.zext = zext <4 x i16> %abs to <4 x i32>
369+
%add = add <4 x i32> %a, %abs.zext
370+
ret <4 x i32> %add
371+
}
372+
373+
define <8 x i16> @sabal_abs_zeros_8b(<8 x i16> %a, <8 x i8> %b) #0 {
374+
; CHECK-LABEL: sabal_abs_zeros_8b:
375+
; CHECK: // %bb.0:
376+
; CHECK-NEXT: movi v2.2d, #0000000000000000
377+
; CHECK-NEXT: sabal v0.8h, v1.8b, v2.8b
378+
; CHECK-NEXT: ret
379+
%abs = call <8 x i8> @llvm.abs.v8i8(<8 x i8> %b, i1 true)
380+
%abs.zext = zext <8 x i8> %abs to <8 x i16>
381+
%add = add <8 x i16> %a, %abs.zext
382+
ret <8 x i16> %add
383+
}
384+
177385
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
178386
declare <2 x i32> @llvm.abs.v2i32(<2 x i32>, i1)
179387
declare <8 x i16> @llvm.abs.v8i16(<8 x i16>, i1)

0 commit comments

Comments
 (0)