Skip to content

Commit ead0742

Browse files
Use dedicated instructions for zip / unzip on arm64
1 parent b6868c2 commit ead0742

File tree

1 file changed

+83
-0
lines changed

1 file changed

+83
-0
lines changed

include/xsimd/arch/xsimd_neon64.hpp

Lines changed: 83 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -952,6 +952,41 @@ namespace xsimd
952952
/**********
953953
* zip_lo *
954954
**********/
955+
// neon64 overload of zip_lo, enabled through detail::enable_sized_unsigned_t<T, 1>
// (presumably: T is an unsigned integer type of size 1 byte -- confirm in detail).
// Forwards to the AArch64 intrinsic vzip1q_u8 (ZIP1), which interleaves the lower
// halves of its operands: {lhs[0], rhs[0], lhs[1], rhs[1], ..., lhs[7], rhs[7]}.
// lhs and rhs are handed to the intrinsic as-is, so batch<T, A> has to convert
// implicitly to the register type it expects; the raw result converts back to
// batch<T, A> on return. The unnamed requires_arch<neon64> parameter is not read
// in the body (presumably a tag used only to select this overload).
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
956+
inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
957+
{
958+
return vzip1q_u8(lhs, rhs);
959+
}
960+
961+
// neon64 overload of zip_lo, enabled through detail::enable_sized_signed_t<T, 1>
// (presumably: T is a signed integer type of size 1 byte -- confirm in detail).
// Forwards to the AArch64 intrinsic vzip1q_s8 (ZIP1), which interleaves the lower
// halves of its operands: {lhs[0], rhs[0], lhs[1], rhs[1], ..., lhs[7], rhs[7]}.
// The batch arguments are passed directly to the intrinsic and its raw result is
// returned directly, so both directions rely on implicit batch <-> register
// conversions. The unnamed requires_arch<neon64> parameter is unused in the body.
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
962+
inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
963+
{
964+
return vzip1q_s8(lhs, rhs);
965+
}
966+
967+
// neon64 overload of zip_lo, enabled through detail::enable_sized_unsigned_t<T, 2>
// (presumably: T is an unsigned integer type of size 2 bytes -- confirm in detail).
// Forwards to the AArch64 intrinsic vzip1q_u16 (ZIP1), which interleaves the lower
// halves of its operands: {lhs[0], rhs[0], lhs[1], rhs[1], lhs[2], rhs[2], lhs[3], rhs[3]}.
// The batch arguments are passed directly to the intrinsic and its raw result is
// returned directly, so both directions rely on implicit batch <-> register
// conversions. The unnamed requires_arch<neon64> parameter is unused in the body.
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
968+
inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
969+
{
970+
return vzip1q_u16(lhs, rhs);
971+
}
972+
973+
// neon64 overload of zip_lo, enabled through detail::enable_sized_signed_t<T, 2>
// (presumably: T is a signed integer type of size 2 bytes -- confirm in detail).
// Forwards to the AArch64 intrinsic vzip1q_s16 (ZIP1), which interleaves the lower
// halves of its operands: {lhs[0], rhs[0], lhs[1], rhs[1], lhs[2], rhs[2], lhs[3], rhs[3]}.
// The batch arguments are passed directly to the intrinsic and its raw result is
// returned directly, so both directions rely on implicit batch <-> register
// conversions. The unnamed requires_arch<neon64> parameter is unused in the body.
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
974+
inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
975+
{
976+
return vzip1q_s16(lhs, rhs);
977+
}
978+
979+
// neon64 overload of zip_lo, enabled through detail::enable_sized_unsigned_t<T, 4>
// (presumably: T is an unsigned integer type of size 4 bytes -- confirm in detail).
// Forwards to the AArch64 intrinsic vzip1q_u32 (ZIP1), which interleaves the lower
// halves of its operands: {lhs[0], rhs[0], lhs[1], rhs[1]}.
// The batch arguments are passed directly to the intrinsic and its raw result is
// returned directly, so both directions rely on implicit batch <-> register
// conversions. The unnamed requires_arch<neon64> parameter is unused in the body.
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
980+
inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
981+
{
982+
return vzip1q_u32(lhs, rhs);
983+
}
984+
985+
// neon64 overload of zip_lo, enabled through detail::enable_sized_signed_t<T, 4>
// (presumably: T is a signed integer type of size 4 bytes -- confirm in detail).
// Forwards to the AArch64 intrinsic vzip1q_s32 (ZIP1), which interleaves the lower
// halves of its operands: {lhs[0], rhs[0], lhs[1], rhs[1]}.
// The batch arguments are passed directly to the intrinsic and its raw result is
// returned directly, so both directions rely on implicit batch <-> register
// conversions. The unnamed requires_arch<neon64> parameter is unused in the body.
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
986+
inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
987+
{
988+
return vzip1q_s32(lhs, rhs);
989+
}
955990

956991
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
957992
inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
@@ -965,6 +1000,12 @@ namespace xsimd
9651000
return vzip1q_s64(lhs, rhs);
9661001
}
9671002

1003+
// neon64 overload of zip_lo for single-precision batches (batch<float, A>).
// Forwards to the AArch64 intrinsic vzip1q_f32 (ZIP1), which interleaves the lower
// halves of its operands: {lhs[0], rhs[0], lhs[1], rhs[1]}.
// The batch arguments are passed directly to the intrinsic and its raw result is
// returned directly, so both directions rely on implicit batch <-> register
// conversions. The unnamed requires_arch<neon64> parameter is unused in the body.
template <class A>
1004+
inline batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon64>) noexcept
1005+
{
1006+
return vzip1q_f32(lhs, rhs);
1007+
}
1008+
9681009
template <class A>
9691010
inline batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
9701011
{
@@ -975,6 +1016,42 @@ namespace xsimd
9751016
* zip_hi *
9761017
**********/
9771018

1019+
// neon64 overload of zip_hi, enabled through detail::enable_sized_unsigned_t<T, 1>
// (presumably: T is an unsigned integer type of size 1 byte -- confirm in detail).
// Forwards to the AArch64 intrinsic vzip2q_u8 (ZIP2), which interleaves the upper
// halves of its operands: {lhs[8], rhs[8], lhs[9], rhs[9], ..., lhs[15], rhs[15]}.
// lhs and rhs are handed to the intrinsic as-is, so batch<T, A> has to convert
// implicitly to the register type it expects; the raw result converts back to
// batch<T, A> on return. The unnamed requires_arch<neon64> parameter is not read
// in the body (presumably a tag used only to select this overload).
template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
1020+
inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1021+
{
1022+
return vzip2q_u8(lhs, rhs);
1023+
}
1024+
1025+
// neon64 overload of zip_hi, enabled through detail::enable_sized_signed_t<T, 1>
// (presumably: T is a signed integer type of size 1 byte -- confirm in detail).
// Forwards to the AArch64 intrinsic vzip2q_s8 (ZIP2), which interleaves the upper
// halves of its operands: {lhs[8], rhs[8], lhs[9], rhs[9], ..., lhs[15], rhs[15]}.
// The batch arguments are passed directly to the intrinsic and its raw result is
// returned directly, so both directions rely on implicit batch <-> register
// conversions. The unnamed requires_arch<neon64> parameter is unused in the body.
template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
1026+
inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1027+
{
1028+
return vzip2q_s8(lhs, rhs);
1029+
}
1030+
1031+
// neon64 overload of zip_hi, enabled through detail::enable_sized_unsigned_t<T, 2>
// (presumably: T is an unsigned integer type of size 2 bytes -- confirm in detail).
// Forwards to the AArch64 intrinsic vzip2q_u16 (ZIP2), which interleaves the upper
// halves of its operands: {lhs[4], rhs[4], lhs[5], rhs[5], lhs[6], rhs[6], lhs[7], rhs[7]}.
// The batch arguments are passed directly to the intrinsic and its raw result is
// returned directly, so both directions rely on implicit batch <-> register
// conversions. The unnamed requires_arch<neon64> parameter is unused in the body.
template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
1032+
inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1033+
{
1034+
return vzip2q_u16(lhs, rhs);
1035+
}
1036+
1037+
// neon64 overload of zip_hi, enabled through detail::enable_sized_signed_t<T, 2>
// (presumably: T is a signed integer type of size 2 bytes -- confirm in detail).
// Forwards to the AArch64 intrinsic vzip2q_s16 (ZIP2), which interleaves the upper
// halves of its operands: {lhs[4], rhs[4], lhs[5], rhs[5], lhs[6], rhs[6], lhs[7], rhs[7]}.
// The batch arguments are passed directly to the intrinsic and its raw result is
// returned directly, so both directions rely on implicit batch <-> register
// conversions. The unnamed requires_arch<neon64> parameter is unused in the body.
template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
1038+
inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1039+
{
1040+
return vzip2q_s16(lhs, rhs);
1041+
}
1042+
1043+
// neon64 overload of zip_hi, enabled through detail::enable_sized_unsigned_t<T, 4>
// (presumably: T is an unsigned integer type of size 4 bytes -- confirm in detail).
// Forwards to the AArch64 intrinsic vzip2q_u32 (ZIP2), which interleaves the upper
// halves of its operands: {lhs[2], rhs[2], lhs[3], rhs[3]}.
// The batch arguments are passed directly to the intrinsic and its raw result is
// returned directly, so both directions rely on implicit batch <-> register
// conversions. The unnamed requires_arch<neon64> parameter is unused in the body.
template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
1044+
inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1045+
{
1046+
return vzip2q_u32(lhs, rhs);
1047+
}
1048+
1049+
// neon64 overload of zip_hi, enabled through detail::enable_sized_signed_t<T, 4>
// (presumably: T is a signed integer type of size 4 bytes -- confirm in detail).
// Forwards to the AArch64 intrinsic vzip2q_s32 (ZIP2), which interleaves the upper
// halves of its operands: {lhs[2], rhs[2], lhs[3], rhs[3]}.
// The batch arguments are passed directly to the intrinsic and its raw result is
// returned directly, so both directions rely on implicit batch <-> register
// conversions. The unnamed requires_arch<neon64> parameter is unused in the body.
template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
1050+
inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
1051+
{
1052+
return vzip2q_s32(lhs, rhs);
1053+
}
1054+
9781055
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
9791056
inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
9801057
{
@@ -987,6 +1064,12 @@ namespace xsimd
9871064
return vzip2q_s64(lhs, rhs);
9881065
}
9891066

1067+
// neon64 overload of zip_hi for single-precision batches (batch<float, A>).
// Forwards to the AArch64 intrinsic vzip2q_f32 (ZIP2), which interleaves the upper
// halves of its operands: {lhs[2], rhs[2], lhs[3], rhs[3]}.
// The batch arguments are passed directly to the intrinsic and its raw result is
// returned directly, so both directions rely on implicit batch <-> register
// conversions. The unnamed requires_arch<neon64> parameter is unused in the body.
template <class A>
1068+
inline batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon64>) noexcept
1069+
{
1070+
return vzip2q_f32(lhs, rhs);
1071+
}
1072+
9901073
template <class A>
9911074
inline batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
9921075
{

0 commit comments

Comments
 (0)