
Commit 105658a

Merge pull request #962 from anutosh491/remaining_ops_impl
Implemented a few operations for the wasm instruction set
2 parents 46c561b + b816668

1 file changed: +180 -6 lines changed

include/xsimd/arch/xsimd_wasm.hpp

@@ -19,11 +19,25 @@
 
 namespace xsimd
 {
+    template <class batch_type, bool... Values>
+    struct batch_bool_constant;
+
+    template <class T_out, class T_in, class A>
+    inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
+
+    template <class batch_type, typename batch_type::value_type... Values>
+    struct batch_constant;
 
     namespace kernel
     {
         using namespace types;
 
+        // fwd
+        template <class A, class T, size_t I>
+        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
+        template <class A, typename T, typename ITy, ITy... Indices>
+        inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept;
+
         // abs
         template <class A, class T, typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
         inline batch<T, A> abs(batch<T, A> const& self, requires_arch<wasm>) noexcept
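
Note: the forward declarations let the wasm kernels name batch_bool_constant, batch_constant, and the generic insert/shuffle overloads before their definitions appear; several kernels below defer to the generic implementation through xsimd's architecture-tag dispatch. A minimal model of that dispatch pattern, with illustrative names (my_op and the bare tag structs are not xsimd code):

    #include <iostream>

    // Illustrative model of xsimd's requires_arch dispatch: an arch-specific
    // overload handles the fast case and falls back to the generic one.
    struct generic {};
    struct wasm : generic {};

    template <class Arch>
    int my_op(int x, Arch) { return x; }      // generic fallback

    int my_op(int x, wasm)                    // arch-specific fast path
    {
        if (x % 2 == 0)
            return x / 2;                     // handled directly
        return my_op(x, generic {});          // defer to the fallback
    }

    int main() { std::cout << my_op(7, wasm {}) << '\n'; } // prints 7

xsimd's real mechanism passes requires_arch<Arch> the same way, so an architecture without a specialized kernel transparently picks up the generic one.
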
@@ -136,6 +150,13 @@ namespace xsimd
             return wasm_i8x16_bitmask(self) != 0;
         }
 
+        // batch_bool_cast
+        template <class A, class T_out, class T_in>
+        inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<wasm>) noexcept
+        {
+            return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
+        }
+
         // bitwise_and
         template <class A, class T>
         inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
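
Note: batch_bool_cast reinterprets a lane mask between element types of the same width; since every wasm batch is a single v128_t, the cast costs nothing at runtime. A usage sketch, assuming the public xsimd::batch_bool_cast entry point that dispatches to this kernel on a wasm-enabled build:

    #include <xsimd/xsimd.hpp>
    #include <cstdint>

    // Reinterpret an int32 comparison mask as a float mask: same 128 bits,
    // different element type.
    xsimd::batch_bool<float> to_float_mask(xsimd::batch<int32_t> a,
                                           xsimd::batch<int32_t> b)
    {
        xsimd::batch_bool<int32_t> m = a < b;    // lanewise compare
        return xsimd::batch_bool_cast<float>(m); // bit-identical reinterpret
    }
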
@@ -162,6 +183,13 @@ namespace xsimd
             return wasm_v128_andnot(self, other);
         }
 
+        // bitwise_cast
+        template <class A, class T, class Tp>
+        inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<wasm>) noexcept
+        {
+            return batch<Tp, A>(self.data);
+        }
+
         // bitwise_or
         template <class A, class T>
         inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
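
Note: unlike fast_cast below, bitwise_cast performs no per-lane value conversion; the v128_t register is handed to the destination batch type unchanged. A short sketch, assuming the public xsimd::bitwise_cast matching the forward declaration at the top of the file:

    #include <xsimd/xsimd.hpp>
    #include <cstdint>

    // View float lanes as their raw IEEE-754 bit patterns.
    xsimd::batch<uint32_t> float_bits(xsimd::batch<float> x)
    {
        return xsimd::bitwise_cast<uint32_t>(x); // e.g. 1.0f -> 0x3F800000
    }
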
@@ -415,6 +443,53 @@ namespace xsimd
             return wasm_f64x2_eq(self, other);
         }
 
+        // fast_cast
+        namespace detail
+        {
+            template <class A>
+            inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<wasm>) noexcept
+            {
+                return wasm_f32x4_convert_i32x4(self);
+            }
+
+            template <class A>
+            inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<wasm>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                // adapted to wasm
+                v128_t xH = wasm_u64x2_shr(x, 32);
+                xH = wasm_v128_or(xH, wasm_f64x2_splat(19342813113834066795298816.)); // 2^84
+                v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
+                v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52
+                v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(19342813118337666422669312.)); // 2^84 + 2^52
+                return wasm_f64x2_add(f, xL);
+            }
+
+            template <class A>
+            inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<wasm>) noexcept
+            {
+                // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+                // adapted to wasm
+                v128_t xH = wasm_i32x4_shr(x, 16);
+                xH = wasm_v128_and(xH, wasm_i16x8_make(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
+                xH = wasm_i64x2_add(xH, wasm_f64x2_splat(442721857769029238784.)); // 3*2^67
+                v128_t mask = wasm_i16x8_make(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
+                v128_t xL = wasm_v128_or(wasm_v128_and(mask, x), wasm_v128_andnot(wasm_f64x2_splat(0x0010000000000000), mask)); // 2^52
+                v128_t f = wasm_f64x2_sub(xH, wasm_f64x2_splat(442726361368656609280.)); // 3*2^67 + 2^52
+                return wasm_f64x2_add(f, xL);
+            }
+
+            template <class A>
+            inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<wasm>) noexcept
+            {
+                return wasm_i32x4_make(
+                    static_cast<int32_t>(wasm_f32x4_extract_lane(self, 0)),
+                    static_cast<int32_t>(wasm_f32x4_extract_lane(self, 1)),
+                    static_cast<int32_t>(wasm_f32x4_extract_lane(self, 2)),
+                    static_cast<int32_t>(wasm_f32x4_extract_lane(self, 3)));
+            }
+        }
+
         // floor
         template <class A>
         inline batch<float, A> floor(batch<float, A> const& self, requires_arch<wasm>) noexcept
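
Note: the two 64-bit-to-double overloads avoid scalar per-lane conversion by OR-ing each integer's halves into the mantissas of doubles with fixed exponents (2^84 for the high 32 bits, 2^52 for the low 32) and then subtracting those same constants, leaving only the integer value. A scalar model of the unsigned variant; u64_to_double is an illustrative name, not part of the commit:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Scalar model of the vector trick: OR the integer halves into double
    // mantissas, then cancel the exponent constants.
    double u64_to_double(uint64_t v)
    {
        // 2^84 as a double has 32 free low mantissa bits, so OR-ing in the
        // high half yields exactly 2^84 + (v >> 32) * 2^32; likewise 2^52
        // plus the low half is exact.
        uint64_t hi_bits, lo_bits;
        double p84 = 19342813113834066795298816.; // 2^84
        double p52 = 4503599627370496.;           // 2^52
        std::memcpy(&hi_bits, &p84, 8);
        std::memcpy(&lo_bits, &p52, 8);
        hi_bits |= v >> 32;
        lo_bits |= v & 0xFFFFFFFFu;
        double hi, lo;
        std::memcpy(&hi, &hi_bits, 8);
        std::memcpy(&lo, &lo_bits, 8);
        // (2^84 + H*2^32) - (2^84 + 2^52) + (2^52 + L) == H*2^32 + L,
        // with a single rounding in the final addition.
        return (hi - 19342813118337666422669312.) + lo; // 2^84 + 2^52
    }

    int main()
    {
        uint64_t v = 0x123456789ABCDEF0ull;
        assert(u64_to_double(v) == static_cast<double>(v));
    }
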
@@ -516,11 +591,11 @@ namespace xsimd
             }
             else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
             {
-                return from_mask(batch_bool<float, A> {}, mask, wasm {});
+                return batch_bool_cast<T>(from_mask(batch_bool<float, A> {}, mask, wasm {}));
             }
             else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
             {
-                return from_mask(batch_bool<double, A> {}, mask, wasm {});
+                return batch_bool_cast<T>(from_mask(batch_bool<double, A> {}, mask, wasm {}));
             }
         }
 
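
Note: from_mask materializes its 4- and 8-byte-lane masks as batch_bool<float> and batch_bool<double>, while the caller asked for batch_bool<T> with an arbitrary same-width T; wrapping the result in batch_bool_cast<T> reinterprets it to the requested type. Illustratively (not from the commit):

    // Before: type mismatch when T is e.g. int32_t.
    // batch_bool<int32_t, A> r = from_mask(batch_bool<float, A> {}, mask, wasm {});
    // After: same bits, correct type.
    // batch_bool<int32_t, A> r = batch_bool_cast<int32_t>(from_mask(batch_bool<float, A> {}, mask, wasm {}));
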
@@ -1039,6 +1114,44 @@ namespace xsimd
             return wasm_f64x2_extract_lane(tmp2, 0);
         }
 
+        // reduce_max
+        template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+        inline T reduce_max(batch<T, A> const& self, requires_arch<wasm>) noexcept
+        {
+            batch<T, A> step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0);
+            batch<T, A> acc0 = max(self, step0);
+
+            batch<T, A> step1 = wasm_i32x4_shuffle(acc0, wasm_i32x4_splat(0), 1, 0, 0, 0);
+            batch<T, A> acc1 = max(acc0, step1);
+
+            batch<T, A> step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7);
+            batch<T, A> acc2 = max(acc1, step2);
+            if (sizeof(T) == 2)
+                return acc2.get(0);
+            batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
+            batch<T, A> acc3 = max(acc2, step3);
+            return acc3.get(0);
+        }
+
+        // reduce_min
+        template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+        inline T reduce_min(batch<T, A> const& self, requires_arch<wasm>) noexcept
+        {
+            batch<T, A> step0 = wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2, 3, 0, 0);
+            batch<T, A> acc0 = min(self, step0);
+
+            batch<T, A> step1 = wasm_i32x4_shuffle(acc0, wasm_i32x4_splat(0), 1, 0, 0, 0);
+            batch<T, A> acc1 = min(acc0, step1);
+
+            batch<T, A> step2 = wasm_i16x8_shuffle(acc1, wasm_i16x8_splat(0), 1, 0, 0, 0, 4, 5, 6, 7);
+            batch<T, A> acc2 = min(acc1, step2);
+            if (sizeof(T) == 2)
+                return acc2.get(0);
+            batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
+            batch<T, A> acc3 = min(acc2, step3);
+            return acc3.get(0);
+        }
+
         // rsqrt
         template <class A>
         inline batch<float, A> rsqrt(batch<float, A> const& self, requires_arch<wasm>) noexcept
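
Note: reduce_max and reduce_min fold the accumulator onto itself, halving the active width at each step: the upper 64 bits onto the lower 64, then 32 onto 32, then 16 onto 16, and, for 8-bit elements only, a final byte fold via a 16-bit shift, so lane 0 holds the result after log2(N) max/min operations. A scalar model of the same halving scheme (reduce_max_model is an illustrative name):

    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    // Fold the upper half of the active region onto the lower half until
    // one element remains; std::max plays the role of the vector max kernel.
    template <class T, std::size_t N>
    T reduce_max_model(const T (&lanes)[N])
    {
        T buf[N];
        std::copy(lanes, lanes + N, buf);
        for (std::size_t width = N / 2; width >= 1; width /= 2)
            for (std::size_t i = 0; i < width; ++i)
                buf[i] = std::max(buf[i], buf[i + width]);
        return buf[0]; // lane 0 holds the maximum
    }

    int main()
    {
        int lanes[8] = { 3, 9, 1, 7, 4, 8, 2, 6 };
        std::cout << reduce_max_model(lanes) << '\n'; // prints 9
    }
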
@@ -1144,6 +1257,33 @@ namespace xsimd
             return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond));
         }
 
+        // shuffle
+        template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
+        inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<wasm>) noexcept
+        {
+            // shuffle within lane
+            if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4)
+                return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3);
+
+            // shuffle within opposite lane
+            if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4)
+                return wasm_i32x4_shuffle(y, x, I0, I1, I2, I3);
+            return shuffle(x, y, mask, generic {});
+        }
+
+        template <class A, class ITy, ITy I0, ITy I1>
+        inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1> mask, requires_arch<wasm>) noexcept
+        {
+            // shuffle within lane
+            if (I0 < 2 && I1 >= 2)
+                return wasm_i64x2_shuffle(x, y, I0, I1);
+
+            // shuffle within opposite lane
+            if (I0 >= 2 && I1 < 2)
+                return wasm_i64x2_shuffle(y, x, I0, I1);
+            return shuffle(x, y, mask, generic {});
+        }
+
         // set
         template <class A, class... Values>
         inline batch<float, A> set(batch<float, A> const&, requires_arch<wasm>, Values... values) noexcept
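
Note: wasm_i32x4_shuffle addresses lanes 0-3 from its first operand and 4-7 from its second, so an index mask whose first half selects from one input and whose second half selects from the other maps to a single instruction (swapping the operands covers the mirrored case); every other pattern falls back to the generic implementation forward-declared at the top of the file. A usage sketch, assuming the public xsimd::shuffle on a wasm default_arch:

    #include <xsimd/xsimd.hpp>
    #include <cstdint>

    // Combine the low halves of x and y: indices 0,1 address x and 4,5
    // address y, which matches the single-instruction path above.
    template <class A = xsimd::default_arch>
    xsimd::batch<float, A> lower_halves(xsimd::batch<float, A> x,
                                        xsimd::batch<float, A> y)
    {
        using mask = xsimd::batch_constant<xsimd::batch<uint32_t, A>, 0, 1, 4, 5>;
        return xsimd::shuffle(x, y, mask {});
    }
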
@@ -1243,25 +1383,21 @@ namespace xsimd
         template <class A>
         inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<wasm>) noexcept
         {
-            // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch.
             return wasm_v128_store(mem, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<wasm>) noexcept
         {
-            // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch.
             return wasm_v128_store((v128_t*)mem, self);
         }
         template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
         inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<wasm>) noexcept
         {
-            // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch.
             return wasm_v128_store((v128_t*)mem, self);
         }
         template <class A>
         inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<wasm>) noexcept
         {
-            // Assuming that mem is aligned properly, you can use wasm_v128_store to store the batch.
             return wasm_v128_store(mem, self);
         }
 
@@ -1363,6 +1499,44 @@ namespace xsimd
             return wasm_f64x2_sqrt(val);
         }
 
+        // swizzle
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
+        {
+            return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3);
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept
+        {
+            return wasm_i64x2_shuffle(self, self, V0, V1);
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept
+        {
+            return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
+        }
+
+        template <class A, uint64_t V0, uint64_t V1>
+        inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<wasm>) noexcept
+        {
+            return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, wasm {}));
+        }
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept
+        {
+            return wasm_i32x4_shuffle(self, wasm_i32x4_splat(0), V0, V1, V2, V3);
+        }
+
+        template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+        inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<wasm>) noexcept
+        {
+            return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, wasm {}));
+        }
+
         // trunc
         template <class A>
         inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<wasm>) noexcept
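
Note: the uint64_t swizzle expresses a 64-bit lane permutation with the 32-bit shuffle by expanding each 64-bit index V into the 32-bit pair (2 * V, 2 * V + 1), and the signed overloads reuse the unsigned kernels through bitwise_cast. For example, reversing the two lanes maps the index pair (1, 0) to the quadruple (2, 3, 0, 1). A sketch, assuming the public xsimd::swizzle:

    #include <xsimd/xsimd.hpp>
    #include <cstdint>

    // Swap the two 64-bit lanes; inside the kernel above this becomes
    // wasm_i32x4_shuffle(self, zero, 2, 3, 0, 1).
    template <class A = xsimd::default_arch>
    xsimd::batch<uint64_t, A> swap_halves(xsimd::batch<uint64_t, A> x)
    {
        using mask = xsimd::batch_constant<xsimd::batch<uint64_t, A>, 1, 0>;
        return xsimd::swizzle(x, mask {});
    }
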
