Skip to content

Commit 84a6789

Browse files
davemgreen and mahesh-attarde
authored and committed
[ARM] Add extra mulh tests with known-bits. NFC
1 parent 3c2d886 commit 84a6789

File tree

1 file changed

+247
-12
lines changed

1 file changed

+247
-12
lines changed

llvm/test/CodeGen/Thumb2/mve-vmulh.ll

Lines changed: 247 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -790,15 +790,250 @@ entry:
790790
ret i16 %result
791791
}
792792

793-
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
794-
795-
796-
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
797-
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
798-
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
799-
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
800-
declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
801-
declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
802-
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
803-
declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>)
804-
declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>)
793+
define arm_aapcs_vfpcc <4 x i32> @vmulhs_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
794+
; CHECK-LABEL: vmulhs_kb_v4i32:
795+
; CHECK: @ %bb.0: @ %entry
796+
; CHECK-NEXT: vmov.f32 s4, s2
797+
; CHECK-NEXT: vmov r1, s9
798+
; CHECK-NEXT: vmov r2, s5
799+
; CHECK-NEXT: vmov.f32 s6, s3
800+
; CHECK-NEXT: vmov.f32 s10, s1
801+
; CHECK-NEXT: vmov r0, s4
802+
; CHECK-NEXT: smmul r0, r0, r1
803+
; CHECK-NEXT: vmov r1, s0
804+
; CHECK-NEXT: smmul r1, r1, r2
805+
; CHECK-NEXT: vmov r2, s7
806+
; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
807+
; CHECK-NEXT: vmov r0, s6
808+
; CHECK-NEXT: vmov r1, s11
809+
; CHECK-NEXT: smmul r0, r0, r1
810+
; CHECK-NEXT: vmov r1, s10
811+
; CHECK-NEXT: smmul r1, r1, r2
812+
; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
813+
; CHECK-NEXT: bx lr
814+
entry:
815+
%s0s = sext <4 x i32> %s0 to <4 x i64>
816+
%s1s = ashr <4 x i64> %s1, <i64 32, i64 32, i64 32, i64 32>
817+
%m = mul <4 x i64> %s0s, %s1s
818+
%s = ashr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
819+
%s2 = trunc <4 x i64> %s to <4 x i32>
820+
ret <4 x i32> %s2
821+
}
822+
823+
define arm_aapcs_vfpcc <4 x i32> @vmulhu_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
824+
; CHECK-LABEL: vmulhu_kb_v4i32:
825+
; CHECK: @ %bb.0: @ %entry
826+
; CHECK-NEXT: vmov.f32 s4, s2
827+
; CHECK-NEXT: vmov r1, s9
828+
; CHECK-NEXT: vmov r2, s5
829+
; CHECK-NEXT: vmov.f32 s6, s3
830+
; CHECK-NEXT: vmov.f32 s10, s1
831+
; CHECK-NEXT: vmov r0, s4
832+
; CHECK-NEXT: umull r0, r1, r0, r1
833+
; CHECK-NEXT: vmov r0, s0
834+
; CHECK-NEXT: umull r0, r2, r0, r2
835+
; CHECK-NEXT: vmov r0, s6
836+
; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
837+
; CHECK-NEXT: vmov r1, s11
838+
; CHECK-NEXT: vmov r2, s7
839+
; CHECK-NEXT: umull r0, r1, r0, r1
840+
; CHECK-NEXT: vmov r0, s10
841+
; CHECK-NEXT: umull r0, r2, r0, r2
842+
; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
843+
; CHECK-NEXT: bx lr
844+
entry:
845+
%s0s = zext <4 x i32> %s0 to <4 x i64>
846+
%s1s = lshr <4 x i64> %s1, <i64 32, i64 32, i64 32, i64 32>
847+
%m = mul <4 x i64> %s0s, %s1s
848+
%s = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
849+
%s2 = trunc <4 x i64> %s to <4 x i32>
850+
ret <4 x i32> %s2
851+
}
852+
853+
define arm_aapcs_vfpcc <4 x i32> @vmulhs_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
854+
; CHECK-LABEL: vmulhs_kbc_v4i32:
855+
; CHECK: @ %bb.0: @ %entry
856+
; CHECK-NEXT: vmov.f32 s4, s2
857+
; CHECK-NEXT: vmov r1, s9
858+
; CHECK-NEXT: vmov r2, s5
859+
; CHECK-NEXT: vmov.f32 s6, s3
860+
; CHECK-NEXT: vmov.f32 s10, s1
861+
; CHECK-NEXT: vmov r0, s4
862+
; CHECK-NEXT: smmul r0, r1, r0
863+
; CHECK-NEXT: vmov r1, s0
864+
; CHECK-NEXT: smmul r1, r2, r1
865+
; CHECK-NEXT: vmov r2, s7
866+
; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
867+
; CHECK-NEXT: vmov r0, s6
868+
; CHECK-NEXT: vmov r1, s11
869+
; CHECK-NEXT: smmul r0, r1, r0
870+
; CHECK-NEXT: vmov r1, s10
871+
; CHECK-NEXT: smmul r1, r2, r1
872+
; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
873+
; CHECK-NEXT: bx lr
874+
entry:
875+
%s0s = sext <4 x i32> %s0 to <4 x i64>
876+
%s1s = ashr <4 x i64> %s1, <i64 32, i64 32, i64 32, i64 32>
877+
%m = mul <4 x i64> %s1s, %s0s
878+
%s = ashr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
879+
%s2 = trunc <4 x i64> %s to <4 x i32>
880+
ret <4 x i32> %s2
881+
}
882+
883+
define arm_aapcs_vfpcc <4 x i32> @vmulhu_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
884+
; CHECK-LABEL: vmulhu_kbc_v4i32:
885+
; CHECK: @ %bb.0: @ %entry
886+
; CHECK-NEXT: vmov.f32 s4, s2
887+
; CHECK-NEXT: vmov r1, s9
888+
; CHECK-NEXT: vmov r2, s5
889+
; CHECK-NEXT: vmov.f32 s6, s3
890+
; CHECK-NEXT: vmov.f32 s10, s1
891+
; CHECK-NEXT: vmov r0, s4
892+
; CHECK-NEXT: umull r0, r1, r1, r0
893+
; CHECK-NEXT: vmov r0, s0
894+
; CHECK-NEXT: umull r0, r2, r2, r0
895+
; CHECK-NEXT: vmov r0, s6
896+
; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
897+
; CHECK-NEXT: vmov r1, s11
898+
; CHECK-NEXT: vmov r2, s7
899+
; CHECK-NEXT: umull r0, r1, r1, r0
900+
; CHECK-NEXT: vmov r0, s10
901+
; CHECK-NEXT: umull r0, r2, r2, r0
902+
; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
903+
; CHECK-NEXT: bx lr
904+
entry:
905+
%s0s = zext <4 x i32> %s0 to <4 x i64>
906+
%s1s = lshr <4 x i64> %s1, <i64 32, i64 32, i64 32, i64 32>
907+
%m = mul <4 x i64> %s1s, %s0s
908+
%s = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
909+
%s2 = trunc <4 x i64> %s to <4 x i32>
910+
ret <4 x i32> %s2
911+
}
912+
913+
define arm_aapcs_vfpcc <8 x i16> @vmulhs_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
914+
; CHECK-LABEL: vmulhs_kb_v8i16:
915+
; CHECK: @ %bb.0: @ %entry
916+
; CHECK-NEXT: .vsave {d8, d9}
917+
; CHECK-NEXT: vpush {d8, d9}
918+
; CHECK-NEXT: vmov.f32 s12, s5
919+
; CHECK-NEXT: vmovlt.s16 q4, q0
920+
; CHECK-NEXT: vmov.f32 s13, s7
921+
; CHECK-NEXT: vmovlb.s16 q0, q0
922+
; CHECK-NEXT: vmov.f32 s5, s6
923+
; CHECK-NEXT: vmov.f32 s14, s9
924+
; CHECK-NEXT: vmov.f32 s15, s11
925+
; CHECK-NEXT: vmov.f32 s6, s8
926+
; CHECK-NEXT: vshr.s32 q3, q3, #16
927+
; CHECK-NEXT: vmov.f32 s7, s10
928+
; CHECK-NEXT: vmul.i32 q3, q4, q3
929+
; CHECK-NEXT: vshr.s32 q1, q1, #16
930+
; CHECK-NEXT: vshr.u32 q3, q3, #16
931+
; CHECK-NEXT: vmul.i32 q0, q0, q1
932+
; CHECK-NEXT: vshr.u32 q0, q0, #16
933+
; CHECK-NEXT: vmovnt.i32 q0, q3
934+
; CHECK-NEXT: vpop {d8, d9}
935+
; CHECK-NEXT: bx lr
936+
entry:
937+
%s0s = sext <8 x i16> %s0 to <8 x i32>
938+
%s1s = ashr <8 x i32> %s1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
939+
%m = mul <8 x i32> %s0s, %s1s
940+
%s = ashr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
941+
%s2 = trunc <8 x i32> %s to <8 x i16>
942+
ret <8 x i16> %s2
943+
}
944+
945+
define arm_aapcs_vfpcc <8 x i16> @vmulhu_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
946+
; CHECK-LABEL: vmulhu_kb_v8i16:
947+
; CHECK: @ %bb.0: @ %entry
948+
; CHECK-NEXT: .vsave {d8, d9}
949+
; CHECK-NEXT: vpush {d8, d9}
950+
; CHECK-NEXT: vmov.f32 s12, s5
951+
; CHECK-NEXT: vmovlt.u16 q4, q0
952+
; CHECK-NEXT: vmov.f32 s13, s7
953+
; CHECK-NEXT: vmovlb.u16 q0, q0
954+
; CHECK-NEXT: vmov.f32 s5, s6
955+
; CHECK-NEXT: vmov.f32 s14, s9
956+
; CHECK-NEXT: vmov.f32 s15, s11
957+
; CHECK-NEXT: vmov.f32 s6, s8
958+
; CHECK-NEXT: vshr.u32 q3, q3, #16
959+
; CHECK-NEXT: vmov.f32 s7, s10
960+
; CHECK-NEXT: vmul.i32 q3, q4, q3
961+
; CHECK-NEXT: vshr.u32 q1, q1, #16
962+
; CHECK-NEXT: vshr.u32 q3, q3, #16
963+
; CHECK-NEXT: vmul.i32 q0, q0, q1
964+
; CHECK-NEXT: vshr.u32 q0, q0, #16
965+
; CHECK-NEXT: vmovnt.i32 q0, q3
966+
; CHECK-NEXT: vpop {d8, d9}
967+
; CHECK-NEXT: bx lr
968+
entry:
969+
%s0s = zext <8 x i16> %s0 to <8 x i32>
970+
%s1s = lshr <8 x i32> %s1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
971+
%m = mul <8 x i32> %s0s, %s1s
972+
%s = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
973+
%s2 = trunc <8 x i32> %s to <8 x i16>
974+
ret <8 x i16> %s2
975+
}
976+
977+
define arm_aapcs_vfpcc <8 x i16> @vmulhs_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
978+
; CHECK-LABEL: vmulhs_kbc_v8i16:
979+
; CHECK: @ %bb.0: @ %entry
980+
; CHECK-NEXT: .vsave {d8, d9}
981+
; CHECK-NEXT: vpush {d8, d9}
982+
; CHECK-NEXT: vmov.f32 s12, s5
983+
; CHECK-NEXT: vmovlt.s16 q4, q0
984+
; CHECK-NEXT: vmov.f32 s13, s7
985+
; CHECK-NEXT: vmovlb.s16 q0, q0
986+
; CHECK-NEXT: vmov.f32 s5, s6
987+
; CHECK-NEXT: vmov.f32 s14, s9
988+
; CHECK-NEXT: vmov.f32 s15, s11
989+
; CHECK-NEXT: vmov.f32 s6, s8
990+
; CHECK-NEXT: vshr.s32 q3, q3, #16
991+
; CHECK-NEXT: vmov.f32 s7, s10
992+
; CHECK-NEXT: vmul.i32 q3, q3, q4
993+
; CHECK-NEXT: vshr.s32 q1, q1, #16
994+
; CHECK-NEXT: vshr.u32 q3, q3, #16
995+
; CHECK-NEXT: vmul.i32 q0, q1, q0
996+
; CHECK-NEXT: vshr.u32 q0, q0, #16
997+
; CHECK-NEXT: vmovnt.i32 q0, q3
998+
; CHECK-NEXT: vpop {d8, d9}
999+
; CHECK-NEXT: bx lr
1000+
entry:
1001+
%s0s = sext <8 x i16> %s0 to <8 x i32>
1002+
%s1s = ashr <8 x i32> %s1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1003+
%m = mul <8 x i32> %s1s, %s0s
1004+
%s = ashr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1005+
%s2 = trunc <8 x i32> %s to <8 x i16>
1006+
ret <8 x i16> %s2
1007+
}
1008+
1009+
define arm_aapcs_vfpcc <8 x i16> @vmulhu_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
1010+
; CHECK-LABEL: vmulhu_kbc_v8i16:
1011+
; CHECK: @ %bb.0: @ %entry
1012+
; CHECK-NEXT: .vsave {d8, d9}
1013+
; CHECK-NEXT: vpush {d8, d9}
1014+
; CHECK-NEXT: vmov.f32 s12, s5
1015+
; CHECK-NEXT: vmovlt.u16 q4, q0
1016+
; CHECK-NEXT: vmov.f32 s13, s7
1017+
; CHECK-NEXT: vmovlb.u16 q0, q0
1018+
; CHECK-NEXT: vmov.f32 s5, s6
1019+
; CHECK-NEXT: vmov.f32 s14, s9
1020+
; CHECK-NEXT: vmov.f32 s15, s11
1021+
; CHECK-NEXT: vmov.f32 s6, s8
1022+
; CHECK-NEXT: vshr.u32 q3, q3, #16
1023+
; CHECK-NEXT: vmov.f32 s7, s10
1024+
; CHECK-NEXT: vmul.i32 q3, q3, q4
1025+
; CHECK-NEXT: vshr.u32 q1, q1, #16
1026+
; CHECK-NEXT: vshr.u32 q3, q3, #16
1027+
; CHECK-NEXT: vmul.i32 q0, q1, q0
1028+
; CHECK-NEXT: vshr.u32 q0, q0, #16
1029+
; CHECK-NEXT: vmovnt.i32 q0, q3
1030+
; CHECK-NEXT: vpop {d8, d9}
1031+
; CHECK-NEXT: bx lr
1032+
entry:
1033+
%s0s = zext <8 x i16> %s0 to <8 x i32>
1034+
%s1s = lshr <8 x i32> %s1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1035+
%m = mul <8 x i32> %s1s, %s0s
1036+
%s = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
1037+
%s2 = trunc <8 x i32> %s to <8 x i16>
1038+
ret <8 x i16> %s2
1039+
}

0 commit comments

Comments (0)