
Commit c63af92

Harmonize constant batch type signature with non constant batch
batch<T, A> <> batch_constant<T, A, Csts...>
batch_bool<T, A> <> batch_bool_constant<T, A, Csts...>

This is a strong API (and ABI) change, but it makes the type system more harmonious.
1 parent f372568 commit c63af92

22 files changed (+255, -253 lines)
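For context, a minimal sketch of what the new signatures look like in user code, assuming an AVX2 target; the arch alias and the pick function are illustrative, not part of this commit:

#include <xsimd/xsimd.hpp>

using arch = xsimd::avx2;

// Before this commit, constant batches were parametrized by the full batch type:
//   xsimd::batch_bool_constant<xsimd::batch<float, arch>, true, false, true, false,
//                              true, false, true, false>
// After it, they mirror batch<T, A> / batch_bool<T, A>:
constexpr xsimd::batch_bool_constant<float, arch, true, false, true, false,
                                     true, false, true, false> even_lanes {};

xsimd::batch<float, arch> pick(xsimd::batch<float, arch> const& a,
                               xsimd::batch<float, arch> const& b)
{
    // Lanes where the constant mask is true come from a, the rest from b;
    // with a compile-time mask this can lower to a single immediate blend.
    return xsimd::select(even_lanes, a, b);
}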

docs/source/api/batch_manip.rst

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ Conditional expression
 .. doxygenfunction:: select(batch_bool<T, A> const &cond, batch<T, A> const &true_br, batch<T, A> const &false_br) noexcept
    :project: xsimd

-.. doxygenfunction:: select(batch_bool_constant<batch<T, A>, Values...> const &cond, batch<T, A> const &true_br, batch<T, A> const &false_br) noexcept
+.. doxygenfunction:: select(batch_bool_constant<T, A, Values...> const &cond, batch<T, A> const &true_br, batch<T, A> const &false_br) noexcept
    :project: xsimd


include/xsimd/arch/generic/xsimd_generic_math.hpp

Lines changed: 1 addition & 1 deletion
@@ -2064,7 +2064,7 @@ namespace xsimd
             inline T reduce(Op op, batch<T, A> const& self, std::integral_constant<unsigned, Lvl>) noexcept
             {
                 using index_type = as_unsigned_integer_t<T>;
-                batch<T, A> split = swizzle(self, make_batch_constant<batch<index_type, A>, split_high<index_type, Lvl / 2>>());
+                batch<T, A> split = swizzle(self, make_batch_constant<index_type, A, split_high<index_type, Lvl / 2>>());
                 return reduce(op, op(split, self), std::integral_constant<unsigned, Lvl / 2>());
             }
         }

include/xsimd/arch/generic/xsimd_generic_memory.hpp

Lines changed: 13 additions & 13 deletions
@@ -21,10 +21,10 @@

 namespace xsimd
 {
-    template <class batch_type, typename batch_type::value_type... Values>
+    template <typename T, class A, T... Values>
     struct batch_constant;

-    template <class batch_type, bool... Values>
+    template <typename T, class A, bool... Values>
     struct batch_bool_constant;

     namespace kernel
@@ -180,7 +180,7 @@ namespace xsimd
                 }
             };
             batch<T, A> tmp(val);
-            return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
+            return select(make_batch_bool_constant<T, A, index_mask>(), self, tmp);
         }

         // get
@@ -295,7 +295,7 @@ namespace xsimd
                 }
             };

-            return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {});
+            return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {});
         }

         template <size_t N, class A, class T>
@@ -316,7 +316,7 @@ namespace xsimd
                 }
             };

-            return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {});
+            return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {});
         }

         template <size_t N, class A, class T>
@@ -455,19 +455,19 @@ namespace xsimd
         }

         template <class A, typename T, typename ITy, ITy... Indices>
-        inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept
+        inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept
         {
             constexpr size_t bsize = sizeof...(Indices);

             // Detect common patterns
             XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...))
             {
-                return swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>());
+                return swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>());
             }

             XSIMD_IF_CONSTEXPR(detail::is_swizzle_snd(bsize, Indices...))
             {
-                return swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>());
+                return swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>());
             }

             XSIMD_IF_CONSTEXPR(detail::is_zip_lo(bsize, Indices...))
@@ -482,7 +482,7 @@ namespace xsimd

             XSIMD_IF_CONSTEXPR(detail::is_select(bsize, Indices...))
             {
-                return select(batch_bool_constant<batch<T, A>, (Indices < bsize)...>(), x, y);
+                return select(batch_bool_constant<T, A, (Indices < bsize)...>(), x, y);
             }

 #if defined(__has_builtin)
@@ -503,9 +503,9 @@ namespace xsimd
 #else
             // Use a generic_pattern. It is suboptimal but clang optimizes this
             // pretty well.
-            batch<T, A> x_lane = swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
-            batch<T, A> y_lane = swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
-            batch_bool_constant<batch<T, A>, (Indices < bsize)...> select_x_lane;
+            batch<T, A> x_lane = swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
+            batch<T, A> y_lane = swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
+            batch_bool_constant<T, A, (Indices < bsize)...> select_x_lane;
             return select(select_x_lane, x_lane, y_lane);
 #endif
         }
@@ -542,7 +542,7 @@ namespace xsimd

         // swizzle
         template <class A, class T, class ITy, ITy... Vs>
-        inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
+        inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<generic>) noexcept
         {
             return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
         }
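The make_batch_constant changes above follow the same scheme; a minimal sketch of the generator-based spelling under the new signature (the reverse_generator struct and the reverse function are illustrations, not taken from this commit):

#include <cstddef>
#include <cstdint>
#include <xsimd/xsimd.hpp>

using arch = xsimd::avx2;

// A generator exposes get(index, size) and is expanded at compile time
// into one constant per lane; this one reverses the lane order.
struct reverse_generator
{
    static constexpr uint32_t get(size_t index, size_t size)
    {
        return static_cast<uint32_t>(size - 1 - index);
    }
};

xsimd::batch<float, arch> reverse(xsimd::batch<float, arch> const& x)
{
    // Old spelling: make_batch_constant<batch<uint32_t, arch>, reverse_generator>()
    // New spelling: element type and architecture are separate parameters.
    return xsimd::swizzle(x, xsimd::make_batch_constant<uint32_t, arch, reverse_generator>());
}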

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 15 additions & 15 deletions
@@ -1161,22 +1161,22 @@ namespace xsimd
             return detail::merge_sse(res_low, res_hi);
         }
         template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+        inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
         {
             return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
         }

         template <class A, bool... Values>
-        inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+        inline batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
         {
-            constexpr auto mask = batch_bool_constant<batch<float, A>, Values...>::mask();
+            constexpr auto mask = batch_bool_constant<float, A, Values...>::mask();
             return _mm256_blend_ps(false_br, true_br, mask);
         }

         template <class A, bool... Values>
-        inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+        inline batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
         {
-            constexpr auto mask = batch_bool_constant<batch<double, A>, Values...>::mask();
+            constexpr auto mask = batch_bool_constant<double, A, Values...>::mask();
             return _mm256_blend_pd(false_br, true_br, mask);
         }

@@ -1238,7 +1238,7 @@ namespace xsimd

         // shuffle
         template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
-        inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
+        inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
         {
             constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
             // shuffle within lane
@@ -1253,7 +1253,7 @@ namespace xsimd
         }

         template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
-        inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
+        inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
         {
             constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3);
             // shuffle within lane
@@ -1504,7 +1504,7 @@ namespace xsimd

         // swizzle (constant mask)
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
+        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
         {
             // duplicate low and high part of input
             __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
@@ -1514,22 +1514,22 @@ namespace xsimd
             __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);

             // normalize mask
-            batch_constant<batch<uint32_t, A>, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+            batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;

             // permute within each lane
             __m256 r0 = _mm256_permutevar_ps(low_low, (batch<uint32_t, A>)half_mask);
             __m256 r1 = _mm256_permutevar_ps(hi_hi, (batch<uint32_t, A>)half_mask);

             // mask to choose the right lane
-            batch_bool_constant<batch<uint32_t, A>, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
+            batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;

             // blend the two permutes
             constexpr auto mask = blend_mask.mask();
             return _mm256_blend_ps(r0, r1, mask);
         }

         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx>) noexcept
+        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx>) noexcept
         {
             // duplicate low and high part of input
             __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
@@ -1539,14 +1539,14 @@ namespace xsimd
             __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);

             // normalize mask
-            batch_constant<batch<uint64_t, A>, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
+            batch_constant<uint64_t, A, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;

             // permute within each lane
             __m256d r0 = _mm256_permutevar_pd(low_low, (batch<uint64_t, A>)half_mask);
             __m256d r1 = _mm256_permutevar_pd(hi_hi, (batch<uint64_t, A>)half_mask);

             // mask to choose the right lane
-            batch_bool_constant<batch<uint64_t, A>, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+            batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;

             // blend the two permutes
             constexpr auto mask = blend_mask.mask();
@@ -1564,7 +1564,7 @@ namespace xsimd
                   uint32_t V7,
                   detail::enable_sized_integral_t<T, 4> = 0>
         inline batch<T, A> swizzle(batch<T, A> const& self,
-                                   batch_constant<batch<uint32_t, A>,
+                                   batch_constant<uint32_t, A,
                                                   V0,
                                                   V1,
                                                   V2,
@@ -1588,7 +1588,7 @@ namespace xsimd
                  detail::enable_sized_integral_t<T, 8> = 0>
         inline batch<T, A>
         swizzle(batch<T, A> const& self,
-                batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> const& mask,
+                batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask,
                 requires_arch<avx>) noexcept
         {
             return bitwise_cast<T>(
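The mask() calls above are what turn the boolean pack into a blend immediate; a minimal illustration under the new signature (the expected value 0b01010101 assumes bit i of the mask tracks lane i, which is what the AVX blend kernels above rely on):

#include <xsimd/xsimd.hpp>

// batch_bool_constant<float, avx, ...> holds one boolean per float lane (8 on AVX);
// mask() packs them into an integer with bit i set iff lane i is true.
constexpr auto blend_imm = xsimd::batch_bool_constant<float, xsimd::avx,
                                                      true, false, true, false,
                                                      true, false, true, false>::mask();
static_assert(blend_imm == 0b01010101, "lane i of true_br is chosen when bit i is set");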

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 8 additions & 8 deletions
@@ -729,9 +729,9 @@ namespace xsimd
             }
         }
         template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
-        inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+        inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
         {
-            constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
+            constexpr int mask = batch_bool_constant<T, A, Values...>::mask();
             // FIXME: for some reason mask here is not considered as an immediate,
             // but it's okay for _mm256_blend_epi32
             // case 2: return _mm256_blend_epi16(false_br, true_br, mask);
@@ -912,36 +912,36 @@ namespace xsimd

         // swizzle (constant mask)
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
         {
             return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask);
         }

         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
         {
             constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
             return _mm256_permute4x64_pd(self, mask);
         }

         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
         {
             constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
             return _mm256_permute4x64_epi64(self, mask);
         }
         template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
-        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
+        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
         {
             return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
         }
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
         {
             return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask);
         }
         template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
-        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
         {
             return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
         }

include/xsimd/arch/xsimd_avx512bw.hpp

Lines changed: 4 additions & 4 deletions
@@ -617,25 +617,25 @@ namespace xsimd

         // swizzle (static version)
         template <class A, uint16_t... Vs>
-        inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+        inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
         {
             return swizzle(self, (batch<uint16_t, A>)mask, avx512bw {});
         }

         template <class A, uint16_t... Vs>
-        inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+        inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
         {
             return swizzle(self, (batch<uint16_t, A>)mask, avx512bw {});
         }

         template <class A, uint8_t... Vs>
-        inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+        inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
         {
             return swizzle(self, (batch<uint8_t, A>)mask, avx512bw {});
         }

         template <class A, uint8_t... Vs>
-        inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+        inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept
         {
             return swizzle(self, (batch<uint8_t, A>)mask, avx512bw {});
         }
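The (batch<uint16_t, A>)mask casts above rely on a constant batch converting to a regular runtime batch, so these kernels can reuse the dynamic-mask swizzle; a minimal sketch (the iota function and its lane values are illustrative):

#include <xsimd/xsimd.hpp>

using arch = xsimd::avx512bw;

// batch<uint16_t, avx512bw> has 32 lanes; a batch_constant carries one
// compile-time value per lane and materializes into a batch on conversion.
xsimd::batch<uint16_t, arch> iota()
{
    constexpr xsimd::batch_constant<uint16_t, arch,
                                    0, 1, 2, 3, 4, 5, 6, 7,
                                    8, 9, 10, 11, 12, 13, 14, 15,
                                    16, 17, 18, 19, 20, 21, 22, 23,
                                    24, 25, 26, 27, 28, 29, 30, 31> idx;
    return static_cast<xsimd::batch<uint16_t, arch>>(idx);
}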
