@@ -790,15 +790,250 @@ entry:
ret i16 %result
}

- declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
-
-
- declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
- declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
- declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
- declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
- declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
- declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
- declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)
- declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>)
- declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>)
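+ ; Signed multiply-high where the second operand arrives pre-widened as <4 x i64> and its top word is taken with ashr; no vector vmulh is formed, so each lane is lowered to a scalar smmul.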
+ define arm_aapcs_vfpcc <4 x i32> @vmulhs_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
+ ; CHECK-LABEL: vmulhs_kb_v4i32:
+ ; CHECK: @ %bb.0: @ %entry
+ ; CHECK-NEXT: vmov.f32 s4, s2
+ ; CHECK-NEXT: vmov r1, s9
+ ; CHECK-NEXT: vmov r2, s5
+ ; CHECK-NEXT: vmov.f32 s6, s3
+ ; CHECK-NEXT: vmov.f32 s10, s1
+ ; CHECK-NEXT: vmov r0, s4
+ ; CHECK-NEXT: smmul r0, r0, r1
+ ; CHECK-NEXT: vmov r1, s0
+ ; CHECK-NEXT: smmul r1, r1, r2
+ ; CHECK-NEXT: vmov r2, s7
+ ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
+ ; CHECK-NEXT: vmov r0, s6
+ ; CHECK-NEXT: vmov r1, s11
+ ; CHECK-NEXT: smmul r0, r0, r1
+ ; CHECK-NEXT: vmov r1, s10
+ ; CHECK-NEXT: smmul r1, r1, r2
+ ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
+ ; CHECK-NEXT: bx lr
+ entry:
+ %s0s = sext <4 x i32> %s0 to <4 x i64>
+ %s1s = ashr <4 x i64> %s1, <i64 32, i64 32, i64 32, i64 32>
+ %m = mul <4 x i64> %s0s, %s1s
+ %s = ashr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
+ %s2 = trunc <4 x i64> %s to <4 x i32>
+ ret <4 x i32> %s2
+ }
+
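+ ; Unsigned version of the pattern above; each lane is lowered to a scalar umull, keeping only the high result register.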
+ define arm_aapcs_vfpcc <4 x i32> @vmulhu_kb_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
+ ; CHECK-LABEL: vmulhu_kb_v4i32:
+ ; CHECK: @ %bb.0: @ %entry
+ ; CHECK-NEXT: vmov.f32 s4, s2
+ ; CHECK-NEXT: vmov r1, s9
+ ; CHECK-NEXT: vmov r2, s5
+ ; CHECK-NEXT: vmov.f32 s6, s3
+ ; CHECK-NEXT: vmov.f32 s10, s1
+ ; CHECK-NEXT: vmov r0, s4
+ ; CHECK-NEXT: umull r0, r1, r0, r1
+ ; CHECK-NEXT: vmov r0, s0
+ ; CHECK-NEXT: umull r0, r2, r0, r2
+ ; CHECK-NEXT: vmov r0, s6
+ ; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
+ ; CHECK-NEXT: vmov r1, s11
+ ; CHECK-NEXT: vmov r2, s7
+ ; CHECK-NEXT: umull r0, r1, r0, r1
+ ; CHECK-NEXT: vmov r0, s10
+ ; CHECK-NEXT: umull r0, r2, r0, r2
+ ; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
+ ; CHECK-NEXT: bx lr
+ entry:
+ %s0s = zext <4 x i32> %s0 to <4 x i64>
+ %s1s = lshr <4 x i64> %s1, <i64 32, i64 32, i64 32, i64 32>
+ %m = mul <4 x i64> %s0s, %s1s
+ %s = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
+ %s2 = trunc <4 x i64> %s to <4 x i32>
+ ret <4 x i32> %s2
+ }
+
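+ ; As vmulhs_kb_v4i32 but with the mul operands commuted; codegen is the same per-lane smmul sequence.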
+ define arm_aapcs_vfpcc <4 x i32> @vmulhs_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
+ ; CHECK-LABEL: vmulhs_kbc_v4i32:
+ ; CHECK: @ %bb.0: @ %entry
+ ; CHECK-NEXT: vmov.f32 s4, s2
+ ; CHECK-NEXT: vmov r1, s9
+ ; CHECK-NEXT: vmov r2, s5
+ ; CHECK-NEXT: vmov.f32 s6, s3
+ ; CHECK-NEXT: vmov.f32 s10, s1
+ ; CHECK-NEXT: vmov r0, s4
+ ; CHECK-NEXT: smmul r0, r1, r0
+ ; CHECK-NEXT: vmov r1, s0
+ ; CHECK-NEXT: smmul r1, r2, r1
+ ; CHECK-NEXT: vmov r2, s7
+ ; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
+ ; CHECK-NEXT: vmov r0, s6
+ ; CHECK-NEXT: vmov r1, s11
+ ; CHECK-NEXT: smmul r0, r1, r0
+ ; CHECK-NEXT: vmov r1, s10
+ ; CHECK-NEXT: smmul r1, r2, r1
+ ; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
+ ; CHECK-NEXT: bx lr
+ entry:
+ %s0s = sext <4 x i32> %s0 to <4 x i64>
+ %s1s = ashr <4 x i64> %s1, <i64 32, i64 32, i64 32, i64 32>
+ %m = mul <4 x i64> %s1s, %s0s
+ %s = ashr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
+ %s2 = trunc <4 x i64> %s to <4 x i32>
+ ret <4 x i32> %s2
+ }
+
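+ ; As vmulhu_kb_v4i32 but with the mul operands commuted; still per-lane umull.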
+ define arm_aapcs_vfpcc <4 x i32> @vmulhu_kbc_v4i32(<4 x i32> %s0, <4 x i64> %s1) {
+ ; CHECK-LABEL: vmulhu_kbc_v4i32:
+ ; CHECK: @ %bb.0: @ %entry
+ ; CHECK-NEXT: vmov.f32 s4, s2
+ ; CHECK-NEXT: vmov r1, s9
+ ; CHECK-NEXT: vmov r2, s5
+ ; CHECK-NEXT: vmov.f32 s6, s3
+ ; CHECK-NEXT: vmov.f32 s10, s1
+ ; CHECK-NEXT: vmov r0, s4
+ ; CHECK-NEXT: umull r0, r1, r1, r0
+ ; CHECK-NEXT: vmov r0, s0
+ ; CHECK-NEXT: umull r0, r2, r2, r0
+ ; CHECK-NEXT: vmov r0, s6
+ ; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
+ ; CHECK-NEXT: vmov r1, s11
+ ; CHECK-NEXT: vmov r2, s7
+ ; CHECK-NEXT: umull r0, r1, r1, r0
+ ; CHECK-NEXT: vmov r0, s10
+ ; CHECK-NEXT: umull r0, r2, r2, r0
+ ; CHECK-NEXT: vmov q0[3], q0[1], r2, r1
+ ; CHECK-NEXT: bx lr
+ entry:
+ %s0s = zext <4 x i32> %s0 to <4 x i64>
+ %s1s = lshr <4 x i64> %s1, <i64 32, i64 32, i64 32, i64 32>
+ %m = mul <4 x i64> %s1s, %s0s
+ %s = lshr <4 x i64> %m, <i64 32, i64 32, i64 32, i64 32>
+ %s2 = trunc <4 x i64> %s to <4 x i32>
+ ret <4 x i32> %s2
+ }
+
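+ ; For <8 x i16> the same pattern stays vectorized: the lanes are widened with vmovlb.s16/vmovlt.s16, multiplied, shifted down, and narrowed back with vmovnt.i32.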
+ define arm_aapcs_vfpcc <8 x i16> @vmulhs_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
+ ; CHECK-LABEL: vmulhs_kb_v8i16:
+ ; CHECK: @ %bb.0: @ %entry
+ ; CHECK-NEXT: .vsave {d8, d9}
+ ; CHECK-NEXT: vpush {d8, d9}
+ ; CHECK-NEXT: vmov.f32 s12, s5
+ ; CHECK-NEXT: vmovlt.s16 q4, q0
+ ; CHECK-NEXT: vmov.f32 s13, s7
+ ; CHECK-NEXT: vmovlb.s16 q0, q0
+ ; CHECK-NEXT: vmov.f32 s5, s6
+ ; CHECK-NEXT: vmov.f32 s14, s9
+ ; CHECK-NEXT: vmov.f32 s15, s11
+ ; CHECK-NEXT: vmov.f32 s6, s8
+ ; CHECK-NEXT: vshr.s32 q3, q3, #16
+ ; CHECK-NEXT: vmov.f32 s7, s10
+ ; CHECK-NEXT: vmul.i32 q3, q4, q3
+ ; CHECK-NEXT: vshr.s32 q1, q1, #16
+ ; CHECK-NEXT: vshr.u32 q3, q3, #16
+ ; CHECK-NEXT: vmul.i32 q0, q0, q1
+ ; CHECK-NEXT: vshr.u32 q0, q0, #16
+ ; CHECK-NEXT: vmovnt.i32 q0, q3
+ ; CHECK-NEXT: vpop {d8, d9}
+ ; CHECK-NEXT: bx lr
+ entry:
+ %s0s = sext <8 x i16> %s0 to <8 x i32>
+ %s1s = ashr <8 x i32> %s1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %m = mul <8 x i32> %s0s, %s1s
+ %s = ashr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %s2 = trunc <8 x i32> %s to <8 x i16>
+ ret <8 x i16> %s2
+ }
+
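+ ; Unsigned <8 x i16> version, using vmovlb.u16/vmovlt.u16 and unsigned shifts.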
+ define arm_aapcs_vfpcc <8 x i16> @vmulhu_kb_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
+ ; CHECK-LABEL: vmulhu_kb_v8i16:
+ ; CHECK: @ %bb.0: @ %entry
+ ; CHECK-NEXT: .vsave {d8, d9}
+ ; CHECK-NEXT: vpush {d8, d9}
+ ; CHECK-NEXT: vmov.f32 s12, s5
+ ; CHECK-NEXT: vmovlt.u16 q4, q0
+ ; CHECK-NEXT: vmov.f32 s13, s7
+ ; CHECK-NEXT: vmovlb.u16 q0, q0
+ ; CHECK-NEXT: vmov.f32 s5, s6
+ ; CHECK-NEXT: vmov.f32 s14, s9
+ ; CHECK-NEXT: vmov.f32 s15, s11
+ ; CHECK-NEXT: vmov.f32 s6, s8
+ ; CHECK-NEXT: vshr.u32 q3, q3, #16
+ ; CHECK-NEXT: vmov.f32 s7, s10
+ ; CHECK-NEXT: vmul.i32 q3, q4, q3
+ ; CHECK-NEXT: vshr.u32 q1, q1, #16
+ ; CHECK-NEXT: vshr.u32 q3, q3, #16
+ ; CHECK-NEXT: vmul.i32 q0, q0, q1
+ ; CHECK-NEXT: vshr.u32 q0, q0, #16
+ ; CHECK-NEXT: vmovnt.i32 q0, q3
+ ; CHECK-NEXT: vpop {d8, d9}
+ ; CHECK-NEXT: bx lr
+ entry:
+ %s0s = zext <8 x i16> %s0 to <8 x i32>
+ %s1s = lshr <8 x i32> %s1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %m = mul <8 x i32> %s0s, %s1s
+ %s = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %s2 = trunc <8 x i32> %s to <8 x i16>
+ ret <8 x i16> %s2
+ }
+
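+ ; Commuted-operand version of vmulhs_kb_v8i16.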
+ define arm_aapcs_vfpcc <8 x i16> @vmulhs_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
+ ; CHECK-LABEL: vmulhs_kbc_v8i16:
+ ; CHECK: @ %bb.0: @ %entry
+ ; CHECK-NEXT: .vsave {d8, d9}
+ ; CHECK-NEXT: vpush {d8, d9}
+ ; CHECK-NEXT: vmov.f32 s12, s5
+ ; CHECK-NEXT: vmovlt.s16 q4, q0
+ ; CHECK-NEXT: vmov.f32 s13, s7
+ ; CHECK-NEXT: vmovlb.s16 q0, q0
+ ; CHECK-NEXT: vmov.f32 s5, s6
+ ; CHECK-NEXT: vmov.f32 s14, s9
+ ; CHECK-NEXT: vmov.f32 s15, s11
+ ; CHECK-NEXT: vmov.f32 s6, s8
+ ; CHECK-NEXT: vshr.s32 q3, q3, #16
+ ; CHECK-NEXT: vmov.f32 s7, s10
+ ; CHECK-NEXT: vmul.i32 q3, q3, q4
+ ; CHECK-NEXT: vshr.s32 q1, q1, #16
+ ; CHECK-NEXT: vshr.u32 q3, q3, #16
+ ; CHECK-NEXT: vmul.i32 q0, q1, q0
+ ; CHECK-NEXT: vshr.u32 q0, q0, #16
+ ; CHECK-NEXT: vmovnt.i32 q0, q3
+ ; CHECK-NEXT: vpop {d8, d9}
+ ; CHECK-NEXT: bx lr
+ entry:
+ %s0s = sext <8 x i16> %s0 to <8 x i32>
+ %s1s = ashr <8 x i32> %s1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %m = mul <8 x i32> %s1s, %s0s
+ %s = ashr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %s2 = trunc <8 x i32> %s to <8 x i16>
+ ret <8 x i16> %s2
+ }
+
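+ ; Commuted-operand version of vmulhu_kb_v8i16.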
+ define arm_aapcs_vfpcc <8 x i16> @vmulhu_kbc_v8i16(<8 x i16> %s0, <8 x i32> %s1) {
+ ; CHECK-LABEL: vmulhu_kbc_v8i16:
+ ; CHECK: @ %bb.0: @ %entry
+ ; CHECK-NEXT: .vsave {d8, d9}
+ ; CHECK-NEXT: vpush {d8, d9}
+ ; CHECK-NEXT: vmov.f32 s12, s5
+ ; CHECK-NEXT: vmovlt.u16 q4, q0
+ ; CHECK-NEXT: vmov.f32 s13, s7
+ ; CHECK-NEXT: vmovlb.u16 q0, q0
+ ; CHECK-NEXT: vmov.f32 s5, s6
+ ; CHECK-NEXT: vmov.f32 s14, s9
+ ; CHECK-NEXT: vmov.f32 s15, s11
+ ; CHECK-NEXT: vmov.f32 s6, s8
+ ; CHECK-NEXT: vshr.u32 q3, q3, #16
+ ; CHECK-NEXT: vmov.f32 s7, s10
+ ; CHECK-NEXT: vmul.i32 q3, q3, q4
+ ; CHECK-NEXT: vshr.u32 q1, q1, #16
+ ; CHECK-NEXT: vshr.u32 q3, q3, #16
+ ; CHECK-NEXT: vmul.i32 q0, q1, q0
+ ; CHECK-NEXT: vshr.u32 q0, q0, #16
+ ; CHECK-NEXT: vmovnt.i32 q0, q3
+ ; CHECK-NEXT: vpop {d8, d9}
+ ; CHECK-NEXT: bx lr
+ entry:
+ %s0s = zext <8 x i16> %s0 to <8 x i32>
+ %s1s = lshr <8 x i32> %s1, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %m = mul <8 x i32> %s1s, %s0s
+ %s = lshr <8 x i32> %m, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+ %s2 = trunc <8 x i32> %s to <8 x i16>
+ ret <8 x i16> %s2
+ }