Skip to content

Commit 1d957d8

Browse files
committed
avx512bw,vbmi: Use make_batch_constant.
Instead of loading from an aligned array on the stack. So we yield the `set` instead of the `load` intrinsic, which makes it easier for the compiler to constant fold these parts. Sadly, MSVC needs this...
1 parent 6af69c2 commit 1d957d8

File tree

2 files changed

+52
-36
lines changed

2 files changed

+52
-36
lines changed

include/xsimd/arch/xsimd_avx512bw.hpp

Lines changed: 34 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -486,17 +486,22 @@ namespace xsimd
486486
// slide_left
487487
namespace detail
488488
{
489-
template <size_t... Is>
490-
constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_hi(::xsimd::detail::index_sequence<Is...>)
489+
struct make_slide_perm_hi
491490
{
492-
return { (Is == 0 ? 8 : Is - 1)... };
493-
}
491+
static constexpr uint64_t get(size_t i, size_t)
492+
{
493+
return i == 0 ? 8 : i - 1;
494+
}
495+
};
494496

495-
template <size_t N, size_t... Is>
496-
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_pattern(::xsimd::detail::index_sequence<Is...>)
497+
template <size_t N>
498+
struct make_slide_left_pattern
497499
{
498-
return { (Is >= N ? Is - N : 0)... };
499-
}
500+
static constexpr uint16_t get(size_t i, size_t)
501+
{
502+
return i >= N ? i - N : 0;
503+
}
504+
};
500505
}
501506

502507
template <size_t N, class A, class T>
@@ -520,10 +525,10 @@ namespace xsimd
520525
buffer[0] = buffer[0] << 8;
521526
xx = _mm512_load_epi64(&buffer[0]);
522527

523-
alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>());
528+
auto slide_perm = xsimd::make_batch_constant<uint64_t, detail::make_slide_perm_hi, A>();
524529
__m512i xl = _mm512_slli_epi64(x, 8);
525530
__m512i xr = _mm512_srli_epi64(x, 56);
526-
xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
531+
xr = _mm512_permutex2var_epi64(xr, slide_perm.as_batch(), _mm512_setzero_si512());
527532
xx = _mm512_or_si512(xr, xl);
528533
if (N == 1)
529534
return xx;
@@ -533,24 +538,29 @@ namespace xsimd
533538
xx = x;
534539
}
535540
__mmask32 mask = 0xFFFFFFFFu << ((N / 2) & 31);
536-
alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
537-
return _mm512_maskz_permutexvar_epi16(mask, _mm512_load_epi32(slide_pattern.data()), xx);
541+
auto slide_pattern = xsimd::make_batch_constant<uint16_t, detail::make_slide_left_pattern<N / 2>, A>();
542+
return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), xx);
538543
}
539544

540545
// slide_right
541546
namespace detail
542547
{
543-
template <size_t... Is>
544-
constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_low(::xsimd::detail::index_sequence<Is...>)
548+
struct make_slide_perm_low
545549
{
546-
return { (Is + 1)... };
547-
}
550+
static constexpr uint64_t get(size_t i, size_t)
551+
{
552+
return i + 1;
553+
}
554+
};
548555

549-
template <size_t N, size_t... Is>
550-
constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_pattern(::xsimd::detail::index_sequence<Is...>)
556+
template <size_t N>
557+
struct make_slide_right_pattern
551558
{
552-
return { (Is < (32 - N) ? Is + N : 0)... };
553-
}
559+
static constexpr uint16_t get(size_t i, size_t n)
560+
{
561+
return i < (n - N) ? i + N : 0;
562+
}
563+
};
554564
}
555565
template <size_t N, class A, class T>
556566
XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
@@ -566,10 +576,10 @@ namespace xsimd
566576
batch<T, A> xx;
567577
if (N & 1)
568578
{
569-
alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>());
579+
auto slide_perm = xsimd::make_batch_constant<uint64_t, detail::make_slide_perm_low, A>();
570580
__m512i xr = _mm512_srli_epi64(x, 8);
571581
__m512i xl = _mm512_slli_epi64(x, 56);
572-
xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
582+
xl = _mm512_permutex2var_epi64(xl, slide_perm.as_batch(), _mm512_setzero_si512());
573583
xx = _mm512_or_si512(xr, xl);
574584
if (N == 1)
575585
return xx;
@@ -579,8 +589,8 @@ namespace xsimd
579589
xx = x;
580590
}
581591
__mmask32 mask = 0xFFFFFFFFu >> ((N / 2) & 31);
582-
alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
583-
return _mm512_maskz_permutexvar_epi16(mask, _mm512_load_epi32(slide_pattern.data()), xx);
592+
auto slide_pattern = xsimd::make_batch_constant<uint16_t, detail::make_slide_right_pattern<N / 2>, A>();
593+
return _mm512_maskz_permutexvar_epi16(mask, slide_pattern.as_batch(), xx);
584594
}
585595

586596
// ssub

include/xsimd/arch/xsimd_avx512vbmi.hpp

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,23 @@ namespace xsimd
2626

2727
namespace detail
2828
{
29-
template <size_t N, size_t... Is>
30-
constexpr std::array<uint8_t, sizeof...(Is)> make_slide_left_bytes_pattern(::xsimd::detail::index_sequence<Is...>)
29+
template <size_t N>
30+
struct make_slide_left_bytes_pattern
3131
{
32-
return { (Is >= N ? Is - N : 0)... };
33-
}
32+
static constexpr uint8_t get(size_t i, size_t)
33+
{
34+
return i >= N ? i - N : 0;
35+
}
36+
};
3437

35-
template <size_t N, size_t... Is>
36-
constexpr std::array<uint8_t, sizeof...(Is)> make_slide_right_bytes_pattern(::xsimd::detail::index_sequence<Is...>)
38+
template <size_t N>
39+
struct make_slide_right_bytes_pattern
3740
{
38-
return { (Is < (64 - N) ? Is + N : 0)... };
39-
}
41+
static constexpr uint8_t get(size_t i, size_t n)
42+
{
43+
return i < (n - N) ? i + N : 0;
44+
}
45+
};
4046
}
4147

4248
// slide_left
@@ -53,8 +59,8 @@ namespace xsimd
5359
}
5460

5561
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull << (N & 63);
56-
alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_bytes_pattern<N>(::xsimd::detail::make_index_sequence<512 / 8>());
57-
return _mm512_maskz_permutexvar_epi8(mask, _mm512_load_epi32(slide_pattern.data()), x);
62+
auto slide_pattern = xsimd::make_batch_constant<uint8_t, detail::make_slide_left_bytes_pattern<N>, A>();
63+
return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
5864
}
5965

6066
// slide_right
@@ -70,8 +76,8 @@ namespace xsimd
7076
return batch<T, A>(T(0));
7177
}
7278
__mmask64 mask = 0xFFFFFFFFFFFFFFFFull >> (N & 63);
73-
alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_bytes_pattern<N>(::xsimd::detail::make_index_sequence<512 / 8>());
74-
return _mm512_maskz_permutexvar_epi8(mask, _mm512_load_epi32(slide_pattern.data()), x);
79+
auto slide_pattern = xsimd::make_batch_constant<uint8_t, detail::make_slide_right_bytes_pattern<N>, A>();
80+
return _mm512_maskz_permutexvar_epi8(mask, slide_pattern.as_batch(), x);
7581
}
7682

7783
// swizzle (dynamic version)

0 commit comments

Comments
 (0)