Skip to content

Commit 6af69c2

Browse files
committed
avx512bw: Optimize slide_left and slide_right.
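This patch picks the instructions from avx512vbmi for the fast path. Zero-masking inside the permute (`_mm512_maskz_permutexvar_epi16` with a `__mmask32`) is faster than applying an additional AND instruction with a separately loaded mask vector.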
1 parent bae9d19 commit 6af69c2

File tree

1 file changed

+8
-20
lines changed

1 file changed

+8
-20
lines changed

include/xsimd/arch/xsimd_avx512bw.hpp

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -497,22 +497,16 @@ namespace xsimd
497497
{
498498
return { (Is >= N ? Is - N : 0)... };
499499
}
500-
template <size_t N, size_t... Is>
501-
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_mask(::xsimd::detail::index_sequence<Is...>)
502-
{
503-
return { (Is >= N ? 0xFFFF : 0x0000)... };
504-
}
505500
}
506501

507502
template <size_t N, class A, class T>
508503
XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
509504
{
510-
constexpr unsigned BitCount = N * 8;
511-
if (BitCount == 0)
505+
if (N == 0)
512506
{
513507
return x;
514508
}
515-
if (BitCount >= 512)
509+
if (N >= 64)
516510
{
517511
return batch<T, A>(T(0));
518512
}
@@ -538,9 +532,9 @@ namespace xsimd
538532
{
539533
xx = x;
540534
}
535+
__mmask32 mask = 0xFFFFFFFFu << ((N / 2) & 31);
541536
alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
542-
alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
543-
return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
537+
return _mm512_maskz_permutexvar_epi16(mask, _mm512_load_epi32(slide_pattern.data()), xx);
544538
}
545539

546540
// slide_right
@@ -557,21 +551,15 @@ namespace xsimd
557551
{
558552
return { (Is < (32 - N) ? Is + N : 0)... };
559553
}
560-
template <size_t N, size_t... Is>
561-
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_mask(::xsimd::detail::index_sequence<Is...>)
562-
{
563-
return { (Is < 32 - N ? 0xFFFF : 0x0000)... };
564-
}
565554
}
566555
template <size_t N, class A, class T>
567556
XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
568557
{
569-
constexpr unsigned BitCount = N * 8;
570-
if (BitCount == 0)
558+
if (N == 0)
571559
{
572560
return x;
573561
}
574-
if (BitCount >= 512)
562+
if (N >= 64)
575563
{
576564
return batch<T, A>(T(0));
577565
}
@@ -590,9 +578,9 @@ namespace xsimd
590578
{
591579
xx = x;
592580
}
581+
__mmask32 mask = 0xFFFFFFFFu >> ((N / 2) & 31);
593582
alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
594-
alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
595-
return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
583+
return _mm512_maskz_permutexvar_epi16(mask, _mm512_load_epi32(slide_pattern.data()), xx);
596584
}
597585

598586
// ssub

0 commit comments

Comments
 (0)