
 namespace xsimd
 {
+    template <class batch_type, bool... Values>
+    struct batch_bool_constant;
+
+    template <class T_out, class T_in, class A>
+    inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
+
+    template <class batch_type, typename batch_type::value_type... Values>
+    struct batch_constant;

     namespace kernel
     {
         using namespace types;

+        // fwd
+        template <class A, class T, size_t I>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
+        template <class A, typename T, typename ITy, ITy... Indices>
+        inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept;
+
         // abs
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
         inline batch<T, A> abs(batch<T, A> const& self, requires_arch<wasm>) noexcept
@@ -136,6 +150,13 @@ namespace xsimd
             return wasm_i8x16_bitmask(self) != 0;
         }

+        // batch_bool_cast
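+        // (masks and batches share the same v128_t storage on wasm, so casting a
+        // batch_bool only reinterprets bits)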
+        template <class A, class T_out, class T_in>
+        inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<wasm>) noexcept
+        {
+            return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
+        }
+
         // bitwise_and
         template <class A, class T>
         inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
@@ -162,6 +183,13 @@ namespace xsimd
             return wasm_v128_andnot(self, other);
         }

+        // bitwise_cast
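+        // (wasm has a single vector type, v128_t, so this only changes the C++ wrapper
+        // type; no instruction is emitted)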
+        template <class A, class T, class Tp>
+        inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<wasm>) noexcept
+        {
+            return batch<Tp, A>(self.data);
+        }
+
         // bitwise_or
         template <class A, class T>
         inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
@@ -415,6 +443,53 @@ namespace xsimd
             return wasm_f64x2_eq(self, other);
         }

+        // fast_cast
+        namespace detail
+        {
+            template <class A>
+            inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<wasm>) noexcept
+            {
+                return wasm_f32x4_convert_i32x4(self);
+            }
+
+            template <class A>
+            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<wasm>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                // adapted to wasm
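+                // idea: stuff each 32-bit half of every u64 lane into the mantissa of a
+                // double with a fixed exponent, so xH = 2^84 + hi * 2^32 and
+                // xL = 2^52 + lo; then (xH - (2^84 + 2^52)) + xL == hi * 2^32 + lo exactly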
+                v128_t xH = wasm_u64x2_shr(x, 32);
+                xH = wasm_v128_or(xH, wasm_f64x2_splat(19342813113834066795298816.)); // 2^84
+                v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
+                v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52
+                v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(19342813118337666422669312.)); // 2^84 + 2^52
+                return wasm_f64x2_add(f, xL);
+            }
+
+            template <class A>
+            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<wasm>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                // adapted to wasm
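+                // same double-magic trick, driven by the top 16 bits: after the shift and
+                // mask, the i64 add folds (x >> 48) into the mantissa of 3*2^67 (whose ULP
+                // is 2^16, leaving room for the sign), xL covers the low 48 bits via 2^52,
+                // and the final subtract/add recombines the two halves exactly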
+                v128_t xH = wasm_i32x4_shr(x, 16);
+                xH = wasm_v128_and(xH, wasm_i16x8_make(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
+                xH = wasm_i64x2_add(xH, wasm_f64x2_splat(442721857769029238784.)); // 3*2^67
+                v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
+                v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52
+                v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(442726361368656609280.)); // 3*2^67 + 2^52
+                return wasm_f64x2_add(f, xL);
+            }
+
+            template <class A>
+            inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<wasm>) noexcept
+            {
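+                // convert lane by lane with scalar casts; wasm_i32x4_trunc_sat_f32x4 would
+                // saturate out-of-range values and map NaN to 0, which differs from the
+                // truncating conversions of the x86 backends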
+                return wasm_i32x4_make(
+                    static_cast<int32_t>(wasm_f32x4_extract_lane(self, 0)),
+                    static_cast<int32_t>(wasm_f32x4_extract_lane(self, 1)),
+                    static_cast<int32_t>(wasm_f32x4_extract_lane(self, 2)),
+                    static_cast<int32_t>(wasm_f32x4_extract_lane(self, 3)));
+            }
+        }
+
         // floor
         template <class A>
         inline batch<float, A> floor(batch<float, A> const& self, requires_arch<wasm>) noexcept
@@ -516,11 +591,11 @@ namespace xsimd
             }
             else XSIMD_IF_CONSTEXPR (sizeof(T) == 4)
             {
-                return from_mask(batch_bool<float, A> {}, mask, wasm {});
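+                // from_mask materializes the pattern as a float (or double) mask; batch_bool_cast reinterprets the bits as T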
+                return batch_bool_cast<T>(from_mask(batch_bool<float, A> {}, mask, wasm {}));
             }
             else XSIMD_IF_CONSTEXPR (sizeof(T) == 8)
             {
-                return from_mask(batch_bool<double, A> {}, mask, wasm {});
+                return batch_bool_cast<T>(from_mask(batch_bool<double, A> {}, mask, wasm {}));
             }
         }

@@ -1039,6 +1114,44 @@ namespace xsimd
             return wasm_f64x2_extract_lane(tmp2, 0);
         }

+        // reduce_max
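+        // (tree reduction for 8- and 16-bit integers: each shuffle folds the upper half
+        // of the active lanes onto the lower half, halving the width until the result
+        // sits in lane 0)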
+        template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+        inline T reduce_max(batch<T, A> const& self, requires_arch<wasm>) noexcept
+        {
+            batch<T, A> step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0);
+            batch<T, A> acc0 = max(self, step0);
+
+            batch<T, A> step1 = wasm_i32x4_shuffle(acc0, wasm_i32x4_splat(0), 1, 0, 0, 0); // fold acc0, not self, to keep step0's result
+            batch<T, A> acc1 = max(acc0, step1);
+
+            batch<T, A> step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7);
+            batch<T, A> acc2 = max(acc1, step2);
+            if (sizeof(T) == 2)
+                return acc2.get(0);
+            batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
+            batch<T, A> acc3 = max(acc2, step3);
+            return acc3.get(0);
+        }
+
+        // reduce_min
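+        // (same folding scheme as reduce_max above, accumulating with min)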
+        template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+        inline T reduce_min(batch<T, A> const& self, requires_arch<wasm>) noexcept
+        {
+            batch<T, A> step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0);
+            batch<T, A> acc0 = min(self, step0);
+
+            batch<T, A> step1 = wasm_i32x4_shuffle(acc0, wasm_i32x4_splat(0), 1, 0, 0, 0); // fold acc0, not self
+            batch<T, A> acc1 = min(acc0, step1);
+
+            batch<T, A> step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7);
+            batch<T, A> acc2 = min(acc1, step2);
+            if (sizeof(T) == 2)
+                return acc2.get(0);
+            batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
+            batch<T, A> acc3 = min(acc2, step3);
+            return acc3.get(0);
+        }
+
         // rsqrt
         template <class A>
         inline batch<float, A> rsqrt(batch<float, A> const& self, requires_arch<wasm>) noexcept
@@ -1144,6 +1257,33 @@ namespace xsimd
             return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond));
         }

+        // shuffle
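+        // (when each half of the mask draws from a single operand, one wasm shuffle
+        // suffices; any other mask falls back to the generic implementation)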
+        template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
+        inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<wasm>) noexcept
+        {
+            // low half from x, high half from y: the mask maps straight onto the intrinsic
+            if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4)
+                return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3);
+
+            // low half from y, high half from x: swap the operands and rebase the indices
+            if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4)
+                return wasm_i32x4_shuffle(y, x, I0 - 4, I1 - 4, I2 + 4, I3 + 4);
+            return shuffle(x, y, mask, generic {});
+        }
+
+        template <class A, class ITy, ITy I0, ITy I1>
+        inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1> mask, requires_arch<wasm>) noexcept
+        {
+            // low element from x, high element from y
+            if (I0 < 2 && I1 >= 2)
+                return wasm_i64x2_shuffle(x, y, I0, I1);
+
+            // low element from y, high element from x: swap and rebase
+            if (I0 >= 2 && I1 < 2)
+                return wasm_i64x2_shuffle(y, x, I0 - 2, I1 + 2);
+            return shuffle(x, y, mask, generic {});
+        }
+
         // set
         template <class A, class... Values>
         inline batch<float, A> set(batch<float, A> const&, requires_arch<wasm>, Values... values) noexcept
@@ -1243,25 +1383,21 @@ namespace xsimd
         template <class A>
         inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<wasm>) noexcept
         {
-            // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch.
             return wasm_v128_store(mem, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<wasm>) noexcept
         {
-            // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch.
             return wasm_v128_store((v128_t*)mem, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
         {
-            // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch.
             return wasm_v128_store((v128_t*)mem, self);
         }
         template <class A>
         inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<wasm>) noexcept
         {
-            // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch.
             return wasm_v128_store(mem, self);
         }

@@ -1363,6 +1499,44 @@ namespace xsimd
             return wasm_f64x2_sqrt(val);
         }

+        // swizzle
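+        // (swizzle is a single-vector permutation; the wasm shuffle intrinsics take two
+        // operands, so self is paired with itself, or with a zero vector whose lanes the
+        // indices never select)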
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
+        {
+            return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3);
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept
+        {
+            return wasm_i64x2_shuffle(self, self, V0, V1);
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept
+        {
+            // move each selected 64-bit lane as its two 32-bit halves
+            return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
+        }
+
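+        // the signed integer swizzles reuse the unsigned implementations through
+        // bitwise_cast, which is free on wasm (single v128_t type)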
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<wasm>) noexcept
+        {
+            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, wasm {}));
+        }
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
+        {
+            return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), V0, V1, V2, V3);
+        }
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<wasm>) noexcept
+        {
+            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, wasm {}));
+        }
+
         // trunc
         template <class A>
         inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<wasm>) noexcept