@@ -25,74 +25,16 @@ define <8 x i16> @udiv_v8i16_by_255(<8 x i16> %x) {
; Unsigned division of every lane of a <16 x i16> vector by the constant 255.
; The "+" expectations below describe a fully vectorized lowering: each input
; register (v0, v1) is widening-multiplied by the splatted constant 0x8081
; (umull for the low four lanes, umull2 for the high four), uzp2 keeps the
; upper 16 bits of every 32-bit product, and ushr #7 finishes the shift, so
; each lane becomes (x * 0x8081) >> 23.
; The "-" expectations are the earlier per-lane scalar sequence (umov, umull
; by 0x01010102, lsr #32, lane insert) that the "+" lines replace.
2525define <16 x i16 > @udiv_v16i16_by_255 (<16 x i16 > %x ) {
2626; CHECK-LABEL: udiv_v16i16_by_255:
2727; CHECK: // %bb.0:
28- ; CHECK-NEXT: umov w9, v0.h[0]
29- ; CHECK-NEXT: umov w11, v1.h[0]
30- ; CHECK-NEXT: mov w8, #258 // =0x102
31- ; CHECK-NEXT: movk w8, #257, lsl #16
32- ; CHECK-NEXT: umov w10, v0.h[1]
33- ; CHECK-NEXT: umov w12, v1.h[1]
34- ; CHECK-NEXT: umov w13, v0.h[2]
35- ; CHECK-NEXT: umov w14, v1.h[2]
36- ; CHECK-NEXT: umull x9, w9, w8
37- ; CHECK-NEXT: umull x11, w11, w8
38- ; CHECK-NEXT: umull x10, w10, w8
39- ; CHECK-NEXT: umull x12, w12, w8
40- ; CHECK-NEXT: lsr x9, x9, #32
41- ; CHECK-NEXT: lsr x11, x11, #32
42- ; CHECK-NEXT: umull x13, w13, w8
43- ; CHECK-NEXT: fmov s2, w9
44- ; CHECK-NEXT: lsr x10, x10, #32
45- ; CHECK-NEXT: umov w9, v0.h[3]
46- ; CHECK-NEXT: fmov s3, w11
47- ; CHECK-NEXT: lsr x12, x12, #32
48- ; CHECK-NEXT: umull x11, w14, w8
49- ; CHECK-NEXT: umov w14, v1.h[3]
50- ; CHECK-NEXT: mov v2.h[1], w10
51- ; CHECK-NEXT: lsr x10, x13, #32
52- ; CHECK-NEXT: mov v3.h[1], w12
53- ; CHECK-NEXT: umov w12, v0.h[4]
54- ; CHECK-NEXT: lsr x11, x11, #32
55- ; CHECK-NEXT: umull x9, w9, w8
56- ; CHECK-NEXT: umull x13, w14, w8
57- ; CHECK-NEXT: umov w14, v1.h[4]
58- ; CHECK-NEXT: mov v2.h[2], w10
59- ; CHECK-NEXT: mov v3.h[2], w11
60- ; CHECK-NEXT: lsr x9, x9, #32
61- ; CHECK-NEXT: umull x10, w12, w8
62- ; CHECK-NEXT: lsr x12, x13, #32
63- ; CHECK-NEXT: umov w11, v0.h[5]
64- ; CHECK-NEXT: umull x13, w14, w8
65- ; CHECK-NEXT: umov w14, v1.h[5]
66- ; CHECK-NEXT: mov v2.h[3], w9
67- ; CHECK-NEXT: lsr x9, x10, #32
68- ; CHECK-NEXT: mov v3.h[3], w12
69- ; CHECK-NEXT: lsr x12, x13, #32
70- ; CHECK-NEXT: umull x10, w11, w8
71- ; CHECK-NEXT: umov w11, v0.h[6]
72- ; CHECK-NEXT: umull x13, w14, w8
73- ; CHECK-NEXT: umov w14, v1.h[6]
74- ; CHECK-NEXT: mov v2.h[4], w9
75- ; CHECK-NEXT: umov w9, v0.h[7]
76- ; CHECK-NEXT: mov v3.h[4], w12
77- ; CHECK-NEXT: lsr x10, x10, #32
78- ; CHECK-NEXT: lsr x12, x13, #32
79- ; CHECK-NEXT: umull x11, w11, w8
80- ; CHECK-NEXT: umull x13, w14, w8
81- ; CHECK-NEXT: umov w14, v1.h[7]
82- ; CHECK-NEXT: mov v2.h[5], w10
83- ; CHECK-NEXT: umull x9, w9, w8
84- ; CHECK-NEXT: mov v3.h[5], w12
85- ; CHECK-NEXT: lsr x10, x11, #32
86- ; CHECK-NEXT: lsr x11, x13, #32
87- ; CHECK-NEXT: umull x8, w14, w8
88- ; CHECK-NEXT: lsr x9, x9, #32
89- ; CHECK-NEXT: mov v2.h[6], w10
90- ; CHECK-NEXT: mov v3.h[6], w11
91- ; CHECK-NEXT: lsr x8, x8, #32
92- ; CHECK-NEXT: mov v2.h[7], w9
93- ; CHECK-NEXT: mov v3.h[7], w8
94- ; CHECK-NEXT: mov v0.16b, v2.16b
95- ; CHECK-NEXT: mov v1.16b, v3.16b
28+ ; CHECK-NEXT: mov w8, #32897 // =0x8081
29+ ; CHECK-NEXT: dup v2.8h, w8
30+ ; CHECK-NEXT: umull2 v3.4s, v0.8h, v2.8h
31+ ; CHECK-NEXT: umull v0.4s, v0.4h, v2.4h
32+ ; CHECK-NEXT: umull2 v4.4s, v1.8h, v2.8h
33+ ; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h
34+ ; CHECK-NEXT: uzp2 v0.8h, v0.8h, v3.8h
35+ ; CHECK-NEXT: uzp2 v1.8h, v1.8h, v4.8h
36+ ; CHECK-NEXT: ushr v0.8h, v0.8h, #7
37+ ; CHECK-NEXT: ushr v1.8h, v1.8h, #7
9638; CHECK-NEXT: ret
9739 %div = udiv <16 x i16 > %x , splat (i16 255 )
9840 ret <16 x i16 > %div
@@ -117,106 +59,19 @@ define <8 x i16> @urem_v8i16_by_255(<8 x i16> %x) {
; Unsigned remainder of every lane of a <16 x i16> vector by the constant 255.
; The "+" expectations below describe a fully vectorized lowering: the
; quotient q is formed by a widening multiply with the splatted constant
; 0x8081 (umull/umull2), uzp2 to keep the upper 16 bits of each 32-bit
; product, and ushr #7; movi then builds a vector whose 16-bit lanes are all
; 0x00ff (255), and mls computes x - q * 255 in place in v0 and v1.
; The "-" expectations are the earlier per-lane scalar sequence (umov, umull
; by 0x01010102, lsr #32, "sub w, w, w, lsl #8" to form -255 * q, add, lane
; insert) that the "+" lines replace.
11759define <16 x i16 > @urem_v16i16_by_255 (<16 x i16 > %x ) {
11860; CHECK-LABEL: urem_v16i16_by_255:
11961; CHECK: // %bb.0:
120- ; CHECK-NEXT: umov w9, v0.h[1]
121- ; CHECK-NEXT: umov w10, v0.h[0]
122- ; CHECK-NEXT: mov w8, #258 // =0x102
123- ; CHECK-NEXT: umov w12, v1.h[0]
124- ; CHECK-NEXT: movk w8, #257, lsl #16
125- ; CHECK-NEXT: umov w11, v1.h[1]
126- ; CHECK-NEXT: umov w17, v0.h[2]
127- ; CHECK-NEXT: umov w18, v1.h[2]
128- ; CHECK-NEXT: umov w0, v0.h[3]
129- ; CHECK-NEXT: umov w1, v1.h[3]
130- ; CHECK-NEXT: umull x13, w9, w8
131- ; CHECK-NEXT: umull x14, w10, w8
132- ; CHECK-NEXT: umull x16, w12, w8
133- ; CHECK-NEXT: umull x15, w11, w8
134- ; CHECK-NEXT: lsr x13, x13, #32
135- ; CHECK-NEXT: lsr x14, x14, #32
136- ; CHECK-NEXT: lsr x16, x16, #32
137- ; CHECK-NEXT: sub w13, w13, w13, lsl #8
138- ; CHECK-NEXT: sub w14, w14, w14, lsl #8
139- ; CHECK-NEXT: lsr x15, x15, #32
140- ; CHECK-NEXT: sub w16, w16, w16, lsl #8
141- ; CHECK-NEXT: add w9, w9, w13
142- ; CHECK-NEXT: umull x13, w17, w8
143- ; CHECK-NEXT: add w10, w10, w14
144- ; CHECK-NEXT: umull x14, w18, w8
145- ; CHECK-NEXT: sub w15, w15, w15, lsl #8
146- ; CHECK-NEXT: add w12, w12, w16
147- ; CHECK-NEXT: fmov s2, w10
148- ; CHECK-NEXT: umov w16, v1.h[4]
149- ; CHECK-NEXT: fmov s3, w12
150- ; CHECK-NEXT: add w11, w11, w15
151- ; CHECK-NEXT: lsr x13, x13, #32
152- ; CHECK-NEXT: lsr x14, x14, #32
153- ; CHECK-NEXT: umov w15, v0.h[4]
154- ; CHECK-NEXT: umull x10, w0, w8
155- ; CHECK-NEXT: umull x12, w1, w8
156- ; CHECK-NEXT: mov v2.h[1], w9
157- ; CHECK-NEXT: sub w13, w13, w13, lsl #8
158- ; CHECK-NEXT: mov v3.h[1], w11
159- ; CHECK-NEXT: sub w14, w14, w14, lsl #8
160- ; CHECK-NEXT: umov w9, v0.h[5]
161- ; CHECK-NEXT: add w13, w17, w13
162- ; CHECK-NEXT: lsr x10, x10, #32
163- ; CHECK-NEXT: umov w11, v1.h[5]
164- ; CHECK-NEXT: add w14, w18, w14
165- ; CHECK-NEXT: lsr x12, x12, #32
166- ; CHECK-NEXT: umull x17, w15, w8
167- ; CHECK-NEXT: umull x18, w16, w8
168- ; CHECK-NEXT: mov v2.h[2], w13
169- ; CHECK-NEXT: sub w10, w10, w10, lsl #8
170- ; CHECK-NEXT: mov v3.h[2], w14
171- ; CHECK-NEXT: sub w12, w12, w12, lsl #8
172- ; CHECK-NEXT: umov w13, v0.h[6]
173- ; CHECK-NEXT: lsr x14, x17, #32
174- ; CHECK-NEXT: add w10, w0, w10
175- ; CHECK-NEXT: umull x17, w9, w8
176- ; CHECK-NEXT: lsr x18, x18, #32
177- ; CHECK-NEXT: add w12, w1, w12
178- ; CHECK-NEXT: umull x0, w11, w8
179- ; CHECK-NEXT: mov v2.h[3], w10
180- ; CHECK-NEXT: umov w10, v1.h[6]
181- ; CHECK-NEXT: sub w14, w14, w14, lsl #8
182- ; CHECK-NEXT: mov v3.h[3], w12
183- ; CHECK-NEXT: sub w18, w18, w18, lsl #8
184- ; CHECK-NEXT: lsr x17, x17, #32
185- ; CHECK-NEXT: add w14, w15, w14
186- ; CHECK-NEXT: umov w12, v0.h[7]
187- ; CHECK-NEXT: add w15, w16, w18
188- ; CHECK-NEXT: lsr x18, x0, #32
189- ; CHECK-NEXT: umov w16, v1.h[7]
190- ; CHECK-NEXT: mov v2.h[4], w14
191- ; CHECK-NEXT: umull x14, w13, w8
192- ; CHECK-NEXT: sub w17, w17, w17, lsl #8
193- ; CHECK-NEXT: mov v3.h[4], w15
194- ; CHECK-NEXT: umull x15, w10, w8
195- ; CHECK-NEXT: sub w18, w18, w18, lsl #8
196- ; CHECK-NEXT: add w9, w9, w17
197- ; CHECK-NEXT: add w11, w11, w18
198- ; CHECK-NEXT: lsr x14, x14, #32
199- ; CHECK-NEXT: lsr x15, x15, #32
200- ; CHECK-NEXT: mov v2.h[5], w9
201- ; CHECK-NEXT: umull x9, w12, w8
202- ; CHECK-NEXT: mov v3.h[5], w11
203- ; CHECK-NEXT: umull x8, w16, w8
204- ; CHECK-NEXT: sub w11, w14, w14, lsl #8
205- ; CHECK-NEXT: sub w14, w15, w15, lsl #8
206- ; CHECK-NEXT: add w11, w13, w11
207- ; CHECK-NEXT: lsr x9, x9, #32
208- ; CHECK-NEXT: add w10, w10, w14
209- ; CHECK-NEXT: lsr x8, x8, #32
210- ; CHECK-NEXT: mov v2.h[6], w11
211- ; CHECK-NEXT: mov v3.h[6], w10
212- ; CHECK-NEXT: sub w9, w9, w9, lsl #8
213- ; CHECK-NEXT: sub w8, w8, w8, lsl #8
214- ; CHECK-NEXT: add w9, w12, w9
215- ; CHECK-NEXT: add w8, w16, w8
216- ; CHECK-NEXT: mov v2.h[7], w9
217- ; CHECK-NEXT: mov v3.h[7], w8
218- ; CHECK-NEXT: mov v0.16b, v2.16b
219- ; CHECK-NEXT: mov v1.16b, v3.16b
62+ ; CHECK-NEXT: mov w8, #32897 // =0x8081
63+ ; CHECK-NEXT: dup v2.8h, w8
64+ ; CHECK-NEXT: umull2 v3.4s, v0.8h, v2.8h
65+ ; CHECK-NEXT: umull v4.4s, v0.4h, v2.4h
66+ ; CHECK-NEXT: umull2 v5.4s, v1.8h, v2.8h
67+ ; CHECK-NEXT: umull v2.4s, v1.4h, v2.4h
68+ ; CHECK-NEXT: uzp2 v3.8h, v4.8h, v3.8h
69+ ; CHECK-NEXT: movi v4.2d, #0xff00ff00ff00ff
70+ ; CHECK-NEXT: uzp2 v2.8h, v2.8h, v5.8h
71+ ; CHECK-NEXT: ushr v3.8h, v3.8h, #7
72+ ; CHECK-NEXT: ushr v2.8h, v2.8h, #7
73+ ; CHECK-NEXT: mls v0.8h, v3.8h, v4.8h
74+ ; CHECK-NEXT: mls v1.8h, v2.8h, v4.8h
22075; CHECK-NEXT: ret
22176 %rem = urem <16 x i16 > %x , splat (i16 255 )
22277 ret <16 x i16 > %rem
0 commit comments