@@ -166,6 +166,129 @@ entry:
   ret i64 %z
 }
 
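+; Mixed-width zero-extends (v8i16 and v8i8) widened to v8i64: currently
+; expanded into per-lane extracts and a scalar umull/umlal accumulation chain.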
+define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_zext(<8 x i16> %x, <8 x i8> %y) {
+; CHECK-LABEL: add_v8i8i16_v8i64_zext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmovlb.u8 q1, q1
+; CHECK-NEXT:    vmov.u16 r0, q0[1]
+; CHECK-NEXT:    vmov.u16 r1, q0[0]
+; CHECK-NEXT:    vmov.u16 r2, q1[0]
+; CHECK-NEXT:    vmov q3[2], q3[0], r1, r0
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-NEXT:    vmov.i64 q2, #0xffff
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
+; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vand q4, q4, q2
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov r1, s18
+; CHECK-NEXT:    vmov r2, s12
+; CHECK-NEXT:    vmov r3, s16
+; CHECK-NEXT:    umull r0, r1, r0, r1
+; CHECK-NEXT:    umlal r0, r1, r2, r3
+; CHECK-NEXT:    vmov.u16 r2, q0[3]
+; CHECK-NEXT:    vmov.u16 r3, q0[2]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov.u16 r3, q1[3]
+; CHECK-NEXT:    vmov.u16 r2, q1[2]
+; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
+; CHECK-NEXT:    vmov r12, s12
+; CHECK-NEXT:    vand q4, q4, q2
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    umlal r0, r1, r12, r2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    umull r2, r3, r2, r3
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[4]
+; CHECK-NEXT:    vmov q3[2], q3[0], r3, r2
+; CHECK-NEXT:    vmov.u16 r3, q1[5]
+; CHECK-NEXT:    vmov.u16 r2, q1[4]
+; CHECK-NEXT:    vand q3, q3, q2
+; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
+; CHECK-NEXT:    vmov r12, s12
+; CHECK-NEXT:    vand q4, q4, q2
+; CHECK-NEXT:    vmov r2, s16
+; CHECK-NEXT:    vmov r3, s18
+; CHECK-NEXT:    umlal r0, r1, r12, r2
+; CHECK-NEXT:    vmov r2, s14
+; CHECK-NEXT:    umull r2, r3, r2, r3
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    vmov.u16 r2, q0[7]
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov.u16 r3, q0[6]
+; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
+; CHECK-NEXT:    vmov.u16 r3, q1[7]
+; CHECK-NEXT:    vmov.u16 r2, q1[6]
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r3
+; CHECK-NEXT:    vmov r12, s0
+; CHECK-NEXT:    vand q1, q1, q2
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    umlal r0, r1, r12, r2
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    umull r2, r3, r2, r3
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = zext <8 x i16> %x to <8 x i64>
+  %yy = zext <8 x i8> %y to <8 x i64>
+  %m = mul <8 x i64> %xx, %yy
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
+  ret i64 %z
+}
+
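+; The sign-extended form of the same reduction: each lane pair is extracted,
+; the i8 lanes are sxtb'd, and the products accumulate through smull/smlal.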
+define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_sext(<8 x i16> %x, <8 x i8> %y) {
+; CHECK-LABEL: add_v8i8i16_v8i64_sext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.u16 r1, q1[1]
+; CHECK-NEXT:    vmov.s16 r0, q0[1]
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    vmov.u16 r3, q1[0]
+; CHECK-NEXT:    smull r0, r1, r0, r1
+; CHECK-NEXT:    vmov.s16 r2, q0[0]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smlal r0, r1, r2, r3
+; CHECK-NEXT:    vmov.u16 r3, q1[2]
+; CHECK-NEXT:    vmov.s16 r2, q0[2]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smlal r0, r1, r2, r3
+; CHECK-NEXT:    vmov.u16 r3, q1[3]
+; CHECK-NEXT:    vmov.s16 r2, q0[3]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smlal r0, r1, r2, r3
+; CHECK-NEXT:    vmov.u16 r3, q1[4]
+; CHECK-NEXT:    vmov.s16 r2, q0[4]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smlal r0, r1, r2, r3
+; CHECK-NEXT:    vmov.u16 r3, q1[5]
+; CHECK-NEXT:    vmov.s16 r2, q0[5]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smlal r0, r1, r2, r3
+; CHECK-NEXT:    vmov.u16 r3, q1[6]
+; CHECK-NEXT:    vmov.s16 r2, q0[6]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smlal r0, r1, r2, r3
+; CHECK-NEXT:    vmov.u16 r3, q1[7]
+; CHECK-NEXT:    vmov.s16 r2, q0[7]
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smlal r0, r1, r2, r3
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = sext <8 x i16> %x to <8 x i64>
+  %yy = sext <8 x i8> %y to <8 x i64>
+  %m = mul <8 x i64> %xx, %yy
+  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
+  ret i64 %z
+}
+
 define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-LABEL: add_v4i16_v4i64_zext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -383,6 +506,73 @@ entry:
   ret i32 %z
 }
 
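+; i8 and i16 inputs zero-extended to i32: the vectors are spilled to the stack
+; and reloaded in halves with widening vldrh.u32 loads, then multiplied, added
+; and reduced with vaddv.u32.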
+define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_zext(<8 x i8> %x, <8 x i16> %y) {
+; CHECK-LABEL: add_v8i8i16_v8i32_zext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vmovlb.u8 q0, q0
+; CHECK-NEXT:    add r1, sp, #16
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    vstrw.32 q0, [r1]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, #8]
+; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
+; CHECK-NEXT:    vldrh.u32 q2, [r1]
+; CHECK-NEXT:    vmul.i32 q0, q1, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r0]
+; CHECK-NEXT:    vmul.i32 q1, q2, q1
+; CHECK-NEXT:    vadd.i32 q0, q1, q0
+; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    add sp, #32
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = zext <8 x i8> %x to <8 x i32>
+  %yy = zext <8 x i16> %y to <8 x i32>
+  %m = mul <8 x i32> %xx, %yy
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
+  ret i32 %z
+}
+
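+; The signed variant rebuilds the i8 half in registers with lane moves and
+; vmovlb.s8/vmovlb.s16, while the i16 half goes through the stack and widening
+; vldrh.s32 loads.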
+define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_sext(<8 x i8> %x, <8 x i16> %y) {
+; CHECK-LABEL: add_v8i8i16_v8i32_sext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .pad #16
+; CHECK-NEXT:    sub sp, #16
+; CHECK-NEXT:    mov r0, sp
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    vmov.u16 r2, q0[4]
+; CHECK-NEXT:    vstrw.32 q1, [r0]
+; CHECK-NEXT:    vmov q1[2], q1[0], r2, r1
+; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    vmov.u16 r2, q0[5]
+; CHECK-NEXT:    vldrh.s32 q2, [r0, #8]
+; CHECK-NEXT:    vmov q1[3], q1[1], r2, r1
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    vmul.i32 q1, q1, q2
+; CHECK-NEXT:    vmov q2[2], q2[0], r2, r1
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
+; CHECK-NEXT:    vmov.u16 r2, q0[1]
+; CHECK-NEXT:    vmov q2[3], q2[1], r2, r1
+; CHECK-NEXT:    vmovlb.s8 q0, q2
+; CHECK-NEXT:    vldrh.s32 q2, [r0]
+; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vmul.i32 q0, q0, q2
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vaddv.u32 r0, q0
+; CHECK-NEXT:    add sp, #16
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = sext <8 x i8> %x to <8 x i32>
+  %yy = sext <8 x i16> %y to <8 x i32>
+  %m = mul <8 x i32> %xx, %yy
+  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
+  ret i32 %z
+}
+
 define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: add_v16i8_v16i16_v16i32_zext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -457,6 +647,23 @@ entry:
   ret i32 %z
 }
 
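+; Mixed sign/zero extension of v4i8: the sext side uses vmovlb.s8/s16, the
+; zext side a vand with #0xff, and a single vmlav.u32 does the multiply-add.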
+define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_szext(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: add_v4i8_v4i32_szext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.s8 q0, q0
+; CHECK-NEXT:    vmov.i32 q2, #0xff
+; CHECK-NEXT:    vand q1, q1, q2
+; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vmlav.u32 r0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = sext <4 x i8> %x to <4 x i32>
+  %yy = zext <4 x i8> %y to <4 x i32>
+  %m = mul <4 x i32> %xx, %yy
+  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
+  ret i32 %z
+}
+
 define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: add_v16i8_v16i16_zext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -485,6 +692,20 @@ entry:
   ret i16 %z
 }
 
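+; Because only the low 16 bits of each product are kept, the sext/zext mix can
+; still use a single vmlav.s8 followed by an sxth of the scalar result.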
+define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_szext(<16 x i8> %x, <16 x i8> %y) {
+; CHECK-LABEL: add_v16i8_v16i16_szext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmlav.s8 r0, q0, q1
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = sext <16 x i8> %x to <16 x i16>
+  %yy = zext <16 x i8> %y to <16 x i16>
+  %m = mul <16 x i16> %xx, %yy
+  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
+  ret i16 %z
+}
+
 define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: add_v8i8_v8i16_zext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -686,6 +907,120 @@ entry:
   ret i64 %z
 }
 
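+; v4i8 x v4i16 zero-extended to i64: lanes are widened in the vector
+; registers, multiplied pairwise with vmullb.u32 and summed with scalar
+; adds/adcs.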
+define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_zext(<4 x i8> %x, <4 x i16> %y) {
+; CHECK-LABEL: add_v4i8i16_v4i64_zext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmov.i32 q2, #0xff
+; CHECK-NEXT:    vmovlb.u16 q1, q1
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vmov.f32 s12, s4
+; CHECK-NEXT:    vmov.f32 s8, s0
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov.f32 s14, s5
+; CHECK-NEXT:    vmullb.u32 q4, q2, q3
+; CHECK-NEXT:    vmov.f32 s8, s2
+; CHECK-NEXT:    vmov r0, r1, d9
+; CHECK-NEXT:    vmov r2, r3, d8
+; CHECK-NEXT:    vmov.f32 s10, s3
+; CHECK-NEXT:    vmov.f32 s0, s6
+; CHECK-NEXT:    vmov.f32 s2, s7
+; CHECK-NEXT:    vmullb.u32 q1, q2, q0
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d2
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    adds r0, r0, r2
+; CHECK-NEXT:    adcs r1, r3
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = zext <4 x i8> %x to <4 x i64>
+  %yy = zext <4 x i16> %y to <4 x i64>
+  %m = mul <4 x i64> %xx, %yy
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
+  ret i64 %z
+}
+
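+; The sign-extended version extracts each lane pair, sign-extends with
+; sxtb/sxth and accumulates through a smull/smlal chain.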
+define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_sext(<4 x i8> %x, <4 x i16> %y) {
+; CHECK-LABEL: add_v4i8i16_v4i64_sext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s10, s5
+; CHECK-NEXT:    vmov r2, s4
+; CHECK-NEXT:    vmov r3, s0
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.f32 s8, s0
+; CHECK-NEXT:    vmov.f32 s10, s1
+; CHECK-NEXT:    vmov r1, s10
+; CHECK-NEXT:    vmov.f32 s8, s6
+; CHECK-NEXT:    vmov.f32 s10, s7
+; CHECK-NEXT:    vmov.f32 s4, s2
+; CHECK-NEXT:    vmov.f32 s6, s3
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    sxth r0, r0
+; CHECK-NEXT:    sxtb r1, r1
+; CHECK-NEXT:    smull r0, r1, r1, r0
+; CHECK-NEXT:    smlal r0, r1, r3, r2
+; CHECK-NEXT:    vmov r2, s8
+; CHECK-NEXT:    vmov r3, s4
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smlal r0, r1, r3, r2
+; CHECK-NEXT:    vmov r2, s10
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    sxth r2, r2
+; CHECK-NEXT:    sxtb r3, r3
+; CHECK-NEXT:    smlal r0, r1, r3, r2
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = sext <4 x i8> %x to <4 x i64>
+  %yy = sext <4 x i16> %y to <4 x i64>
+  %m = mul <4 x i64> %xx, %yy
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
+  ret i64 %z
+}
+
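+; When the multiply is done in i32 and only the reduction widens to i64, the
+; whole sequence folds to a vmul.i32 feeding vaddlv.u32.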
+define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_zext(<4 x i8> %x, <4 x i16> %y) {
+; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_zext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.i32 q2, #0xff
+; CHECK-NEXT:    vmovlb.u16 q1, q1
+; CHECK-NEXT:    vand q0, q0, q2
+; CHECK-NEXT:    vmul.i32 q0, q0, q1
+; CHECK-NEXT:    vaddlv.u32 r0, r1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = zext <4 x i8> %x to <4 x i32>
+  %yy = zext <4 x i16> %y to <4 x i32>
+  %mm = mul <4 x i32> %xx, %yy
+  %m = zext <4 x i32> %mm to <4 x i64>
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
+  ret i64 %z
+}
+
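+; Signed variant of the same pattern: vmovlb sign-extends both inputs and
+; vaddlv.s32 performs the widening reduction.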
+define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_sext(<4 x i8> %x, <4 x i16> %y) {
+; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_sext:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovlb.s8 q0, q0
+; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vmul.i32 q0, q0, q1
+; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = sext <4 x i8> %x to <4 x i32>
+  %yy = sext <4 x i16> %y to <4 x i32>
+  %mm = mul <4 x i32> %xx, %yy
+  %m = sext <4 x i32> %mm to <4 x i64>
+  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
+  ret i64 %z
+}
+
 define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
 ; CHECK-LABEL: add_v2i8_v2i64_zext:
 ; CHECK:       @ %bb.0: @ %entry