Skip to content

Commit ed3f775

Browse files
[AArch64][NFC] Add test for vector udiv scalarization
1 parent 045331e commit ed3f775

File tree

1 file changed

+223
-0
lines changed

1 file changed

+223
-0
lines changed
Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
3+
4+
; This test verifies that udiv/urem by constant works correctly even when type
5+
; legalization promotes constant operands (e.g., i16 -> i32 in BUILD_VECTOR).
6+
; This is a regression test for a bug where v16i16 would be split into two
7+
; v8i16 operations during legalization, the i16 constants would be promoted
8+
; to i32, and then the second DAGCombine round would fail to recognize the
9+
; promoted constants when trying to convert udiv into mul+shift.
10+
11+
; udiv of a single 128-bit <8 x i16> vector by splat(255).
; The assertions below expect the division to stay in vector form: a widening
; multiply of both halves by 0x8081 (umull/umull2), uzp2 to gather the high
; 16 bits of each 32-bit product, then a logical shift right by 7.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define <8 x i16> @udiv_v8i16_by_255(<8 x i16> %x) {
12+
; CHECK-LABEL: udiv_v8i16_by_255:
13+
; CHECK: // %bb.0:
14+
; CHECK-NEXT: mov w8, #32897 // =0x8081
15+
; CHECK-NEXT: dup v1.8h, w8
16+
; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h
17+
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
18+
; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h
19+
; CHECK-NEXT: ushr v0.8h, v0.8h, #7
20+
; CHECK-NEXT: ret
21+
%div = udiv <8 x i16> %x, splat (i16 255)
22+
ret <8 x i16> %div
23+
}
24+
25+
; udiv of a <16 x i16> vector by splat(255); the input is read from v0 and v1.
; Unlike the <8 x i16> case, the assertions below currently record per-lane
; scalar code: every element is extracted with umov, multiplied by the 32-bit
; constant 0x01010102 (mov #258 / movk #257, lsl #16) with a scalar umull,
; shifted right by 32, and re-inserted into v2/v3, which are finally copied
; back to v0/v1.
; NOTE(review): presumably this is the scalarization described in the file
; header and the expected output should become two copies of the vector
; sequence once the combine handles promoted constants -- confirm against the
; follow-up change.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define <16 x i16> @udiv_v16i16_by_255(<16 x i16> %x) {
26+
; CHECK-LABEL: udiv_v16i16_by_255:
27+
; CHECK: // %bb.0:
28+
; CHECK-NEXT: umov w9, v0.h[0]
29+
; CHECK-NEXT: umov w11, v1.h[0]
30+
; CHECK-NEXT: mov w8, #258 // =0x102
31+
; CHECK-NEXT: movk w8, #257, lsl #16
32+
; CHECK-NEXT: umov w10, v0.h[1]
33+
; CHECK-NEXT: umov w12, v1.h[1]
34+
; CHECK-NEXT: umov w13, v0.h[2]
35+
; CHECK-NEXT: umov w14, v1.h[2]
36+
; CHECK-NEXT: umull x9, w9, w8
37+
; CHECK-NEXT: umull x11, w11, w8
38+
; CHECK-NEXT: umull x10, w10, w8
39+
; CHECK-NEXT: umull x12, w12, w8
40+
; CHECK-NEXT: lsr x9, x9, #32
41+
; CHECK-NEXT: lsr x11, x11, #32
42+
; CHECK-NEXT: umull x13, w13, w8
43+
; CHECK-NEXT: fmov s2, w9
44+
; CHECK-NEXT: lsr x10, x10, #32
45+
; CHECK-NEXT: umov w9, v0.h[3]
46+
; CHECK-NEXT: fmov s3, w11
47+
; CHECK-NEXT: lsr x12, x12, #32
48+
; CHECK-NEXT: umull x11, w14, w8
49+
; CHECK-NEXT: umov w14, v1.h[3]
50+
; CHECK-NEXT: mov v2.h[1], w10
51+
; CHECK-NEXT: lsr x10, x13, #32
52+
; CHECK-NEXT: mov v3.h[1], w12
53+
; CHECK-NEXT: umov w12, v0.h[4]
54+
; CHECK-NEXT: lsr x11, x11, #32
55+
; CHECK-NEXT: umull x9, w9, w8
56+
; CHECK-NEXT: umull x13, w14, w8
57+
; CHECK-NEXT: umov w14, v1.h[4]
58+
; CHECK-NEXT: mov v2.h[2], w10
59+
; CHECK-NEXT: mov v3.h[2], w11
60+
; CHECK-NEXT: lsr x9, x9, #32
61+
; CHECK-NEXT: umull x10, w12, w8
62+
; CHECK-NEXT: lsr x12, x13, #32
63+
; CHECK-NEXT: umov w11, v0.h[5]
64+
; CHECK-NEXT: umull x13, w14, w8
65+
; CHECK-NEXT: umov w14, v1.h[5]
66+
; CHECK-NEXT: mov v2.h[3], w9
67+
; CHECK-NEXT: lsr x9, x10, #32
68+
; CHECK-NEXT: mov v3.h[3], w12
69+
; CHECK-NEXT: lsr x12, x13, #32
70+
; CHECK-NEXT: umull x10, w11, w8
71+
; CHECK-NEXT: umov w11, v0.h[6]
72+
; CHECK-NEXT: umull x13, w14, w8
73+
; CHECK-NEXT: umov w14, v1.h[6]
74+
; CHECK-NEXT: mov v2.h[4], w9
75+
; CHECK-NEXT: umov w9, v0.h[7]
76+
; CHECK-NEXT: mov v3.h[4], w12
77+
; CHECK-NEXT: lsr x10, x10, #32
78+
; CHECK-NEXT: lsr x12, x13, #32
79+
; CHECK-NEXT: umull x11, w11, w8
80+
; CHECK-NEXT: umull x13, w14, w8
81+
; CHECK-NEXT: umov w14, v1.h[7]
82+
; CHECK-NEXT: mov v2.h[5], w10
83+
; CHECK-NEXT: umull x9, w9, w8
84+
; CHECK-NEXT: mov v3.h[5], w12
85+
; CHECK-NEXT: lsr x10, x11, #32
86+
; CHECK-NEXT: lsr x11, x13, #32
87+
; CHECK-NEXT: umull x8, w14, w8
88+
; CHECK-NEXT: lsr x9, x9, #32
89+
; CHECK-NEXT: mov v2.h[6], w10
90+
; CHECK-NEXT: mov v3.h[6], w11
91+
; CHECK-NEXT: lsr x8, x8, #32
92+
; CHECK-NEXT: mov v2.h[7], w9
93+
; CHECK-NEXT: mov v3.h[7], w8
94+
; CHECK-NEXT: mov v0.16b, v2.16b
95+
; CHECK-NEXT: mov v1.16b, v3.16b
96+
; CHECK-NEXT: ret
97+
%div = udiv <16 x i16> %x, splat (i16 255)
98+
ret <16 x i16> %div
99+
}
100+
101+
; urem of a single 128-bit <8 x i16> vector by splat(255).
; The assertions below expect the quotient to be formed in vector registers
; with the same sequence as the <8 x i16> udiv test (umull/umull2 by 0x8081,
; uzp2, ushr #7), followed by an mls against a vector whose 16-bit lanes are
; all 0x00ff, i.e. x - q * 255.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define <8 x i16> @urem_v8i16_by_255(<8 x i16> %x) {
102+
; CHECK-LABEL: urem_v8i16_by_255:
103+
; CHECK: // %bb.0:
104+
; CHECK-NEXT: mov w8, #32897 // =0x8081
105+
; CHECK-NEXT: dup v1.8h, w8
106+
; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h
107+
; CHECK-NEXT: umull v1.4s, v0.4h, v1.4h
108+
; CHECK-NEXT: uzp2 v1.8h, v1.8h, v2.8h
109+
; CHECK-NEXT: movi v2.2d, #0xff00ff00ff00ff
110+
; CHECK-NEXT: ushr v1.8h, v1.8h, #7
111+
; CHECK-NEXT: mls v0.8h, v1.8h, v2.8h
112+
; CHECK-NEXT: ret
113+
%rem = urem <8 x i16> %x, splat (i16 255)
114+
ret <8 x i16> %rem
115+
}
116+
117+
; urem of a <16 x i16> vector by splat(255); the input is read from v0 and v1.
; As in the <16 x i16> udiv test, the assertions below currently record
; per-lane scalar code. For each element: umov extracts the lane, a scalar
; umull by 0x01010102 followed by lsr #32 yields the quotient q,
; "sub w, w, w, lsl #8" computes q - 256*q (= -255*q), and the add with the
; original lane value gives x - 255*q, which is inserted into v2/v3. The two
; result vectors are then copied back to v0/v1.
; NOTE(review): the expected output uses w17, w18, w0 and w1 as scratch
; registers, so these assertions are tied to the aarch64-none-linux-gnu triple
; in the RUN line; presumably targets that reserve x18 would allocate
; differently -- confirm before adding further RUN lines.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
define <16 x i16> @urem_v16i16_by_255(<16 x i16> %x) {
118+
; CHECK-LABEL: urem_v16i16_by_255:
119+
; CHECK: // %bb.0:
120+
; CHECK-NEXT: umov w9, v0.h[1]
121+
; CHECK-NEXT: umov w10, v0.h[0]
122+
; CHECK-NEXT: mov w8, #258 // =0x102
123+
; CHECK-NEXT: umov w12, v1.h[0]
124+
; CHECK-NEXT: movk w8, #257, lsl #16
125+
; CHECK-NEXT: umov w11, v1.h[1]
126+
; CHECK-NEXT: umov w17, v0.h[2]
127+
; CHECK-NEXT: umov w18, v1.h[2]
128+
; CHECK-NEXT: umov w0, v0.h[3]
129+
; CHECK-NEXT: umov w1, v1.h[3]
130+
; CHECK-NEXT: umull x13, w9, w8
131+
; CHECK-NEXT: umull x14, w10, w8
132+
; CHECK-NEXT: umull x16, w12, w8
133+
; CHECK-NEXT: umull x15, w11, w8
134+
; CHECK-NEXT: lsr x13, x13, #32
135+
; CHECK-NEXT: lsr x14, x14, #32
136+
; CHECK-NEXT: lsr x16, x16, #32
137+
; CHECK-NEXT: sub w13, w13, w13, lsl #8
138+
; CHECK-NEXT: sub w14, w14, w14, lsl #8
139+
; CHECK-NEXT: lsr x15, x15, #32
140+
; CHECK-NEXT: sub w16, w16, w16, lsl #8
141+
; CHECK-NEXT: add w9, w9, w13
142+
; CHECK-NEXT: umull x13, w17, w8
143+
; CHECK-NEXT: add w10, w10, w14
144+
; CHECK-NEXT: umull x14, w18, w8
145+
; CHECK-NEXT: sub w15, w15, w15, lsl #8
146+
; CHECK-NEXT: add w12, w12, w16
147+
; CHECK-NEXT: fmov s2, w10
148+
; CHECK-NEXT: umov w16, v1.h[4]
149+
; CHECK-NEXT: fmov s3, w12
150+
; CHECK-NEXT: add w11, w11, w15
151+
; CHECK-NEXT: lsr x13, x13, #32
152+
; CHECK-NEXT: lsr x14, x14, #32
153+
; CHECK-NEXT: umov w15, v0.h[4]
154+
; CHECK-NEXT: umull x10, w0, w8
155+
; CHECK-NEXT: umull x12, w1, w8
156+
; CHECK-NEXT: mov v2.h[1], w9
157+
; CHECK-NEXT: sub w13, w13, w13, lsl #8
158+
; CHECK-NEXT: mov v3.h[1], w11
159+
; CHECK-NEXT: sub w14, w14, w14, lsl #8
160+
; CHECK-NEXT: umov w9, v0.h[5]
161+
; CHECK-NEXT: add w13, w17, w13
162+
; CHECK-NEXT: lsr x10, x10, #32
163+
; CHECK-NEXT: umov w11, v1.h[5]
164+
; CHECK-NEXT: add w14, w18, w14
165+
; CHECK-NEXT: lsr x12, x12, #32
166+
; CHECK-NEXT: umull x17, w15, w8
167+
; CHECK-NEXT: umull x18, w16, w8
168+
; CHECK-NEXT: mov v2.h[2], w13
169+
; CHECK-NEXT: sub w10, w10, w10, lsl #8
170+
; CHECK-NEXT: mov v3.h[2], w14
171+
; CHECK-NEXT: sub w12, w12, w12, lsl #8
172+
; CHECK-NEXT: umov w13, v0.h[6]
173+
; CHECK-NEXT: lsr x14, x17, #32
174+
; CHECK-NEXT: add w10, w0, w10
175+
; CHECK-NEXT: umull x17, w9, w8
176+
; CHECK-NEXT: lsr x18, x18, #32
177+
; CHECK-NEXT: add w12, w1, w12
178+
; CHECK-NEXT: umull x0, w11, w8
179+
; CHECK-NEXT: mov v2.h[3], w10
180+
; CHECK-NEXT: umov w10, v1.h[6]
181+
; CHECK-NEXT: sub w14, w14, w14, lsl #8
182+
; CHECK-NEXT: mov v3.h[3], w12
183+
; CHECK-NEXT: sub w18, w18, w18, lsl #8
184+
; CHECK-NEXT: lsr x17, x17, #32
185+
; CHECK-NEXT: add w14, w15, w14
186+
; CHECK-NEXT: umov w12, v0.h[7]
187+
; CHECK-NEXT: add w15, w16, w18
188+
; CHECK-NEXT: lsr x18, x0, #32
189+
; CHECK-NEXT: umov w16, v1.h[7]
190+
; CHECK-NEXT: mov v2.h[4], w14
191+
; CHECK-NEXT: umull x14, w13, w8
192+
; CHECK-NEXT: sub w17, w17, w17, lsl #8
193+
; CHECK-NEXT: mov v3.h[4], w15
194+
; CHECK-NEXT: umull x15, w10, w8
195+
; CHECK-NEXT: sub w18, w18, w18, lsl #8
196+
; CHECK-NEXT: add w9, w9, w17
197+
; CHECK-NEXT: add w11, w11, w18
198+
; CHECK-NEXT: lsr x14, x14, #32
199+
; CHECK-NEXT: lsr x15, x15, #32
200+
; CHECK-NEXT: mov v2.h[5], w9
201+
; CHECK-NEXT: umull x9, w12, w8
202+
; CHECK-NEXT: mov v3.h[5], w11
203+
; CHECK-NEXT: umull x8, w16, w8
204+
; CHECK-NEXT: sub w11, w14, w14, lsl #8
205+
; CHECK-NEXT: sub w14, w15, w15, lsl #8
206+
; CHECK-NEXT: add w11, w13, w11
207+
; CHECK-NEXT: lsr x9, x9, #32
208+
; CHECK-NEXT: add w10, w10, w14
209+
; CHECK-NEXT: lsr x8, x8, #32
210+
; CHECK-NEXT: mov v2.h[6], w11
211+
; CHECK-NEXT: mov v3.h[6], w10
212+
; CHECK-NEXT: sub w9, w9, w9, lsl #8
213+
; CHECK-NEXT: sub w8, w8, w8, lsl #8
214+
; CHECK-NEXT: add w9, w12, w9
215+
; CHECK-NEXT: add w8, w16, w8
216+
; CHECK-NEXT: mov v2.h[7], w9
217+
; CHECK-NEXT: mov v3.h[7], w8
218+
; CHECK-NEXT: mov v0.16b, v2.16b
219+
; CHECK-NEXT: mov v1.16b, v3.16b
220+
; CHECK-NEXT: ret
221+
%rem = urem <16 x i16> %x, splat (i16 255)
222+
ret <16 x i16> %rem
223+
}

0 commit comments

Comments
 (0)