Commit c2506b5

[AArch64][SVE] Generate asrd instruction for positive pow-2 divisors when SVE is available
Currently, sdiv(x, y) --> cmlt + usra + sshr, where `y` is a positive pow-2 integer. This patch transforms it into sdiv(x, y) --> ptrue + asrd when SVE is available.
1 parent 45a3056 commit c2506b5
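
As a quick illustration of what both sequences compute, here is a minimal scalar C++ sketch (illustrative only, not part of the commit; the function names, the int32 scalar model, and the divide-by-8 specialization are assumptions). It models the Neon cmlt + usra + sshr chain against plain round-toward-zero division, which is what a single SVE asrd #3 performs per lane:

// Scalar model of the two lowerings for sdiv(x, 8). Illustrative sketch only;
// assumes C++20 semantics for right shifts and conversions of negative values.
#include <cassert>
#include <cstdint>
#include <initializer_list>

// Neon-style: cmlt builds an all-ones mask for negative lanes, usra #29 adds
// the bias (divisor - 1 = 7) only to negative lanes, and sshr #3 does the
// final arithmetic shift. Together this is round-toward-zero division by 8.
int32_t sdiv8_neon_style(int32_t x) {
  uint32_t sign_mask = x < 0 ? 0xFFFFFFFFu : 0u;     // cmlt v1.4s, v0.4s, #0
  uint32_t biased = uint32_t(x) + (sign_mask >> 29); // usra v0.4s, v1.4s, #29
  return int32_t(biased) >> 3;                       // sshr v0.4s, v0.4s, #3
}

// SVE asrd #3 is a single predicated rounding arithmetic shift, i.e. exactly
// round-toward-zero division by 8, so only ptrue + asrd are needed.
int32_t sdiv8_asrd_style(int32_t x) { return x / 8; }

int main() {
  for (int32_t x : {-100, -17, -8, -1, 0, 1, 7, 8, 100})
    assert(sdiv8_neon_style(x) == sdiv8_asrd_style(x));
  return 0;
}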

File tree

2 files changed: 37 additions & 58 deletions


llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 5 additions & 0 deletions
@@ -18421,6 +18421,11 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
 
   EVT VT = N->getValueType(0);
 
+  // For negative divisors, this yields (ptrue + asrd + subr), which is not
+  // profitable compared to the Neon sequence (cmlt + usra + sshr).
+  if (Subtarget->hasSVE() && !Divisor.isNegatedPowerOf2())
+    return SDValue(N, 0);
+
   // For scalable and fixed types, mark them as cheap so we can handle it much
   // later. This allows us to handle larger than legal types.
   if (VT.isScalableVector() ||

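To make the profitability comment above concrete: dividing by a negated power of two is the positive-divisor result negated, so the SVE lowering would need a subtract-from-zero (subr) after the asrd, whereas a positive divisor needs no such fix-up. A minimal scalar sketch of that identity (illustrative only, not part of the commit; the function names are assumptions):

#include <cassert>
#include <cstdint>

// Positive power-of-two divisor: maps to a single asrd (rounding shift).
int32_t sdiv_by_pos_pow2(int32_t x, unsigned log2d) {
  return x / (int32_t{1} << log2d);
}

// Negated power-of-two divisor: same shift, then an extra negation, which is
// the additional subr step the comment above refers to.
int32_t sdiv_by_neg_pow2(int32_t x, unsigned log2d) {
  return -sdiv_by_pos_pow2(x, log2d);
}

int main() {
  assert(sdiv_by_neg_pow2(-17, 3) == -17 / -8); // both yield 2
  assert(sdiv_by_neg_pow2(17, 3) == 17 / -8);   // both yield -2
  return 0;
}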
llvm/test/CodeGen/AArch64/sve-fixed-length-sdiv-pow2.ll

Lines changed: 32 additions & 58 deletions
@@ -33,9 +33,10 @@ define <2 x i32> @sdiv_v2i32_negative_pow2_divisor_unpacked(<2 x i32> %op1) vsca
 define <4 x i32> @sdiv_v4i32_positive_pow2_divisor_packed(<4 x i32> %op1) vscale_range(1,0) #0 {
 ; CHECK-LABEL: sdiv_v4i32_positive_pow2_divisor_packed:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cmlt v1.4s, v0.4s, #0
-; CHECK-NEXT: usra v0.4s, v1.4s, #29
-; CHECK-NEXT: sshr v0.4s, v0.4s, #3
+; CHECK-NEXT: ptrue p0.s, vl4
+; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
+; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #3
+; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
 ; CHECK-NEXT: ret
   %res = sdiv <4 x i32> %op1, splat (i32 8)
   ret <4 x i32> %res
@@ -44,9 +45,10 @@ define <4 x i32> @sdiv_v4i32_positive_pow2_divisor_packed(<4 x i32> %op1) vscale
 define <2 x i32> @sdiv_v2i32_positive_pow2_divisor_unpacked(<2 x i32> %op1) vscale_range(1,0) #0 {
 ; CHECK-LABEL: sdiv_v2i32_positive_pow2_divisor_unpacked:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: cmlt v1.2s, v0.2s, #0
-; CHECK-NEXT: usra v0.2s, v1.2s, #29
-; CHECK-NEXT: sshr v0.2s, v0.2s, #3
+; CHECK-NEXT: ptrue p0.s, vl2
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT: asrd z0.s, p0/m, z0.s, #3
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
 ; CHECK-NEXT: ret
   %res = sdiv <2 x i32> %op1, splat (i32 8)
   ret <2 x i32> %res
@@ -95,19 +97,12 @@ define void @sdiv_v64i8(ptr %a) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v64i8:
 ; VBITS_GE_128: // %bb.0:
 ; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT: ldp q3, q4, [x0]
-; VBITS_GE_128-NEXT: cmlt v2.16b, v0.16b, #0
-; VBITS_GE_128-NEXT: cmlt v5.16b, v1.16b, #0
-; VBITS_GE_128-NEXT: cmlt v6.16b, v3.16b, #0
-; VBITS_GE_128-NEXT: usra v0.16b, v2.16b, #3
-; VBITS_GE_128-NEXT: cmlt v2.16b, v4.16b, #0
-; VBITS_GE_128-NEXT: usra v1.16b, v5.16b, #3
-; VBITS_GE_128-NEXT: usra v3.16b, v6.16b, #3
-; VBITS_GE_128-NEXT: usra v4.16b, v2.16b, #3
-; VBITS_GE_128-NEXT: sshr v0.16b, v0.16b, #5
-; VBITS_GE_128-NEXT: sshr v1.16b, v1.16b, #5
-; VBITS_GE_128-NEXT: sshr v2.16b, v3.16b, #5
-; VBITS_GE_128-NEXT: sshr v3.16b, v4.16b, #5
+; VBITS_GE_128-NEXT: ptrue p0.b, vl16
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: asrd z0.b, p0/m, z0.b, #5
+; VBITS_GE_128-NEXT: asrd z1.b, p0/m, z1.b, #5
+; VBITS_GE_128-NEXT: asrd z2.b, p0/m, z2.b, #5
+; VBITS_GE_128-NEXT: asrd z3.b, p0/m, z3.b, #5
 ; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
 ; VBITS_GE_128-NEXT: stp q2, q3, [x0]
 ; VBITS_GE_128-NEXT: ret
@@ -209,19 +204,12 @@ define void @sdiv_v32i16(ptr %a) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v32i16:
 ; VBITS_GE_128: // %bb.0:
 ; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT: ldp q3, q4, [x0]
-; VBITS_GE_128-NEXT: cmlt v2.8h, v0.8h, #0
-; VBITS_GE_128-NEXT: cmlt v5.8h, v1.8h, #0
-; VBITS_GE_128-NEXT: cmlt v6.8h, v3.8h, #0
-; VBITS_GE_128-NEXT: usra v0.8h, v2.8h, #11
-; VBITS_GE_128-NEXT: cmlt v2.8h, v4.8h, #0
-; VBITS_GE_128-NEXT: usra v1.8h, v5.8h, #11
-; VBITS_GE_128-NEXT: usra v3.8h, v6.8h, #11
-; VBITS_GE_128-NEXT: usra v4.8h, v2.8h, #11
-; VBITS_GE_128-NEXT: sshr v0.8h, v0.8h, #5
-; VBITS_GE_128-NEXT: sshr v1.8h, v1.8h, #5
-; VBITS_GE_128-NEXT: sshr v2.8h, v3.8h, #5
-; VBITS_GE_128-NEXT: sshr v3.8h, v4.8h, #5
+; VBITS_GE_128-NEXT: ptrue p0.h, vl8
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: asrd z0.h, p0/m, z0.h, #5
+; VBITS_GE_128-NEXT: asrd z1.h, p0/m, z1.h, #5
+; VBITS_GE_128-NEXT: asrd z2.h, p0/m, z2.h, #5
+; VBITS_GE_128-NEXT: asrd z3.h, p0/m, z3.h, #5
 ; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
 ; VBITS_GE_128-NEXT: stp q2, q3, [x0]
 ; VBITS_GE_128-NEXT: ret
@@ -324,19 +312,12 @@ define void @sdiv_v16i32(ptr %a) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v16i32:
 ; VBITS_GE_128: // %bb.0:
 ; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT: ldp q3, q4, [x0]
-; VBITS_GE_128-NEXT: cmlt v2.4s, v0.4s, #0
-; VBITS_GE_128-NEXT: cmlt v5.4s, v1.4s, #0
-; VBITS_GE_128-NEXT: cmlt v6.4s, v3.4s, #0
-; VBITS_GE_128-NEXT: usra v0.4s, v2.4s, #27
-; VBITS_GE_128-NEXT: cmlt v2.4s, v4.4s, #0
-; VBITS_GE_128-NEXT: usra v1.4s, v5.4s, #27
-; VBITS_GE_128-NEXT: usra v3.4s, v6.4s, #27
-; VBITS_GE_128-NEXT: usra v4.4s, v2.4s, #27
-; VBITS_GE_128-NEXT: sshr v0.4s, v0.4s, #5
-; VBITS_GE_128-NEXT: sshr v1.4s, v1.4s, #5
-; VBITS_GE_128-NEXT: sshr v2.4s, v3.4s, #5
-; VBITS_GE_128-NEXT: sshr v3.4s, v4.4s, #5
+; VBITS_GE_128-NEXT: ptrue p0.s, vl4
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: asrd z0.s, p0/m, z0.s, #5
+; VBITS_GE_128-NEXT: asrd z1.s, p0/m, z1.s, #5
+; VBITS_GE_128-NEXT: asrd z2.s, p0/m, z2.s, #5
+; VBITS_GE_128-NEXT: asrd z3.s, p0/m, z3.s, #5
 ; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
 ; VBITS_GE_128-NEXT: stp q2, q3, [x0]
 ; VBITS_GE_128-NEXT: ret
@@ -439,19 +420,12 @@ define void @sdiv_v8i64(ptr %a) #0 {
 ; VBITS_GE_128-LABEL: sdiv_v8i64:
 ; VBITS_GE_128: // %bb.0:
 ; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
-; VBITS_GE_128-NEXT: ldp q3, q4, [x0]
-; VBITS_GE_128-NEXT: cmlt v2.2d, v0.2d, #0
-; VBITS_GE_128-NEXT: cmlt v5.2d, v1.2d, #0
-; VBITS_GE_128-NEXT: cmlt v6.2d, v3.2d, #0
-; VBITS_GE_128-NEXT: usra v0.2d, v2.2d, #59
-; VBITS_GE_128-NEXT: cmlt v2.2d, v4.2d, #0
-; VBITS_GE_128-NEXT: usra v1.2d, v5.2d, #59
-; VBITS_GE_128-NEXT: usra v3.2d, v6.2d, #59
-; VBITS_GE_128-NEXT: usra v4.2d, v2.2d, #59
-; VBITS_GE_128-NEXT: sshr v0.2d, v0.2d, #5
-; VBITS_GE_128-NEXT: sshr v1.2d, v1.2d, #5
-; VBITS_GE_128-NEXT: sshr v2.2d, v3.2d, #5
-; VBITS_GE_128-NEXT: sshr v3.2d, v4.2d, #5
+; VBITS_GE_128-NEXT: ptrue p0.d, vl2
+; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
+; VBITS_GE_128-NEXT: asrd z0.d, p0/m, z0.d, #5
+; VBITS_GE_128-NEXT: asrd z1.d, p0/m, z1.d, #5
+; VBITS_GE_128-NEXT: asrd z2.d, p0/m, z2.d, #5
+; VBITS_GE_128-NEXT: asrd z3.d, p0/m, z3.d, #5
 ; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
 ; VBITS_GE_128-NEXT: stp q2, q3, [x0]
 ; VBITS_GE_128-NEXT: ret
