Skip to content

Commit b1284c4

Browse files
WIP
1 parent 3f6d496 commit b1284c4

File tree

3 files changed

+60
-44
lines changed

3 files changed

+60
-44
lines changed

include/xsimd/arch/common/xsimd_common_math.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1087,7 +1087,7 @@ namespace xsimd
10871087
template <class A, class T>
10881088
XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<common>) noexcept
10891089
{
1090-
return batch<T, A>(self.data) & batch<T, A>(1);
1090+
return batch<T, A>((typename batch<T, A>::register_type)self.data) & batch<T, A>(1);
10911091
}
10921092

10931093
// horner

include/xsimd/arch/xsimd_altivec.hpp

Lines changed: 55 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -122,9 +122,16 @@ namespace xsimd
122122
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
123123
XSIMD_INLINE batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<altivec>) noexcept
124124
{
125-
constexpr auto nbit = 8 * sizeof(T) - 1;
126-
auto adj = ((self ^ other) << nbit) >> nbit;
127-
return avgr(self, other, A {}) - adj;
125+
XSIMD_IF_CONSTEXPR(sizeof(T) < 8)
126+
{
127+
constexpr auto nbit = 8 * sizeof(T) - 1;
128+
auto adj = bitwise_cast<T>(bitwise_cast<as_unsigned_integer_t<T>>((self ^ other) << nbit) >> nbit);
129+
return avgr(self, other, A {}) - adj;
130+
}
131+
else
132+
{
133+
return avg(self, other, common {});
134+
}
128135
}
129136
template <class A>
130137
XSIMD_INLINE batch<float, A> avg(batch<float, A> const& self, batch<float, A> const& other, requires_arch<altivec>) noexcept
@@ -207,7 +214,14 @@ namespace xsimd
207214
{
208215
using shift_type = as_unsigned_integer_t<T>;
209216
batch<shift_type, A> shift(static_cast<shift_type>(other));
210-
return vec_sr(self.data, shift.data);
217+
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
218+
{
219+
return vec_sra(self.data, shift.data);
220+
}
221+
else
222+
{
223+
return vec_sr(self.data, shift.data);
224+
}
211225
}
212226

213227
// bitwise_xor
@@ -226,7 +240,7 @@ namespace xsimd
226240
template <class A, class T_in, class T_out>
227241
XSIMD_INLINE batch<T_out, A> bitwise_cast(batch<T_in, A> const& self, batch<T_out, A> const&, requires_arch<altivec>) noexcept
228242
{
229-
return *reinterpret_cast<typename batch<T_out, A>::register_type const*>(&self.data);
243+
return (typename batch<T_out, A>::register_type)(self.data);
230244
}
231245

232246
// broadcast
@@ -243,43 +257,43 @@ namespace xsimd
243257
template <class A>
244258
XSIMD_INLINE batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<altivec>) noexcept
245259
{
246-
return vec_mergel(self.real().data, self.imag().data);
260+
return vec_mergeh(self.real().data, self.imag().data);
247261
}
248262
template <class A>
249263
XSIMD_INLINE batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<altivec>) noexcept
250264
{
251-
return vec_mergel(self.real().data, self.imag().data);
265+
return vec_mergeh(self.real().data, self.imag().data);
252266
}
253267
// complex_high
254268
template <class A>
255269
XSIMD_INLINE batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<altivec>) noexcept
256270
{
257-
return vec_mergeh(self.real().data, self.imag().data);
271+
return vec_mergel(self.real().data, self.imag().data);
258272
}
259273
template <class A>
260274
XSIMD_INLINE batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<altivec>) noexcept
261275
{
262-
return vec_mergeh(self.real().data, self.imag().data);
276+
return vec_mergel(self.real().data, self.imag().data);
263277
}
264278
}
265279

266280
// decr_if
267281
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
268282
XSIMD_INLINE batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<altivec>) noexcept
269283
{
270-
return self + batch<T, A>(mask.data);
284+
return self + batch<T, A>((typename batch<T, A>::register_type)mask.data);
271285
}
272286

273287
// div
274288
template <class A>
275289
XSIMD_INLINE batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<altivec>) noexcept
276290
{
277-
return vec_mul(self.data, vec_re(other.data));
291+
return vec_div(self.data, other.data);
278292
}
279293
template <class A>
280294
XSIMD_INLINE batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<altivec>) noexcept
281295
{
282-
return vec_mul(self.data, vec_re(other.data));
296+
return vec_div(self.data, other.data);
283297
}
284298

285299
// fast_cast
@@ -471,7 +485,7 @@ namespace xsimd
471485
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
472486
XSIMD_INLINE batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<altivec>) noexcept
473487
{
474-
return self - batch<T, A>(mask.data);
488+
return self - batch<T, A>((typename batch<T, A>::register_type)mask.data);
475489
}
476490

477491
// insert
@@ -504,9 +518,7 @@ namespace xsimd
504518
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
505519
XSIMD_INLINE batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<altivec>) noexcept
506520
{
507-
auto lo = vec_ld(0, reinterpret_cast<const typename batch<T, A>::register_type*>(mem));
508-
auto hi = vec_ld(16, reinterpret_cast<const typename batch<T, A>::register_type*>(mem));
509-
return vec_perm(lo, hi, vec_lvsl(0, mem));
521+
return *(typename batch<T, A>::register_type const*)mem;
510522
}
511523

512524
// load_complex
@@ -515,7 +527,9 @@ namespace xsimd
515527
template <class A>
516528
XSIMD_INLINE batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<altivec>) noexcept
517529
{
518-
return { vec_mergee(hi.data, lo.data), vec_mergeo(hi.data, lo.data) };
530+
__vector unsigned char perme = { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
531+
__vector unsigned char permo = { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
532+
return { vec_perm(hi.data, lo.data, perme), vec_perm(hi.data, lo.data, permo) };
519533
}
520534
template <class A>
521535
XSIMD_INLINE batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<altivec>) noexcept
@@ -685,7 +699,7 @@ namespace xsimd
685699
{
686700
auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0
687701
auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0
688-
auto tmp2 = vec_mergeh(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0
702+
auto tmp2 = vec_mergel(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0
689703
auto tmp3 = vec_add(tmp1, tmp2);
690704
return vec_extract(tmp3, 0);
691705
}
@@ -694,7 +708,7 @@ namespace xsimd
694708
{
695709
auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0
696710
auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0
697-
auto tmp2 = vec_mergeh(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0
711+
auto tmp2 = vec_mergel(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0
698712
auto tmp3 = vec_add(tmp1, tmp2);
699713
return vec_extract(tmp3, 0);
700714
}
@@ -704,7 +718,7 @@ namespace xsimd
704718
// FIXME: find an in-order approach
705719
auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0
706720
auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0
707-
auto tmp2 = vec_mergeh(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0
721+
auto tmp2 = vec_mergel(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0
708722
auto tmp3 = vec_add(tmp1, tmp2);
709723
return vec_extract(tmp3, 0);
710724
}
@@ -783,7 +797,7 @@ namespace xsimd
783797
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
784798
XSIMD_INLINE batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<altivec>) noexcept
785799
{
786-
return vec_sel(true_br.data, false_br.data, cond.data);
800+
return vec_sel(false_br.data, true_br.data, cond.data);
787801
}
788802
template <class A, class T, bool... Values, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
789803
XSIMD_INLINE batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<altivec>) noexcept
@@ -844,14 +858,29 @@ namespace xsimd
844858
template <size_t N, class A, class T>
845859
XSIMD_INLINE batch<T, A> slide_left(batch<T, A> const& x, requires_arch<altivec>) noexcept
846860
{
847-
return (typename batch<T, A>::register_type)vec_sll((__vector unsigned char)x.data, vec_splats((uint32_t)N));
861+
XSIMD_IF_CONSTEXPR(N == batch<T, A>::size * sizeof(T))
862+
{
863+
return batch<T, A>(0);
864+
}
865+
else
866+
{
867+
auto slider = vec_splats((uint8_t)(8 * N));
868+
return (typename batch<T, A>::register_type)vec_slo(x.data, slider);
869+
}
848870
}
849871

850872
// slide_right
851873
template <size_t N, class A, class T>
852874
XSIMD_INLINE batch<T, A> slide_right(batch<T, A> const& x, requires_arch<altivec>) noexcept
853875
{
854-
return (typename batch<T, A>::register_type)vec_srl((__vector unsigned char)x.data, vec_splats((uint32_t)N));
876+
XSIMD_IF_CONSTEXPR(N == batch<T, A>::size * sizeof(T))
877+
{
878+
return batch<T, A>(0);
879+
}
880+
else
881+
{
882+
return (typename batch<T, A>::register_type)vec_sro((__vector unsigned char)x.data, vec_splats((uint8_t)(8 * N)));
883+
}
855884
}
856885

857886
// sadd
@@ -895,14 +924,7 @@ namespace xsimd
895924
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
896925
XSIMD_INLINE void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<altivec>) noexcept
897926
{
898-
auto tmp = vec_perm(*reinterpret_cast<const __vector unsigned char*>(&self.data), *reinterpret_cast<const __vector unsigned char*>(&self.data), vec_lvsr(0, (unsigned char*)mem));
899-
vec_ste((__vector unsigned char)tmp, 0, (unsigned char*)mem);
900-
vec_ste((__vector unsigned short)tmp, 1, (unsigned short*)mem);
901-
vec_ste((__vector unsigned int)tmp, 3, (unsigned int*)mem);
902-
vec_ste((__vector unsigned int)tmp, 4, (unsigned int*)mem);
903-
vec_ste((__vector unsigned int)tmp, 8, (unsigned int*)mem);
904-
vec_ste((__vector unsigned int)tmp, 12, (unsigned int*)mem);
905-
vec_ste((__vector unsigned short)tmp, 14, (unsigned short*)mem);
927+
*(typename batch<T, A>::register_type*)mem = self.data;
906928
}
907929

908930
// sub
@@ -1064,14 +1086,14 @@ namespace xsimd
10641086
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
10651087
XSIMD_INLINE batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<altivec>) noexcept
10661088
{
1067-
return vec_mergeh(self.data, other.data);
1089+
return vec_mergel(self.data, other.data);
10681090
}
10691091

10701092
// zip_lo
10711093
template <class A, class T, class = typename std::enable_if<std::is_scalar<T>::value, void>::type>
10721094
XSIMD_INLINE batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<altivec>) noexcept
10731095
{
1074-
return vec_mergel(self.data, other.data);
1096+
return vec_mergeh(self.data, other.data);
10751097
}
10761098
}
10771099
}

test/test_batch_int.cpp

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -285,21 +285,15 @@ struct batch_int_test
285285
for (int32_t i = 0; i < s; ++i)
286286
{
287287
res = lhs << i;
288-
value_type expected = value_type(1) << i;
289-
for (std::size_t j = 0; j < size; ++j)
290-
{
291-
CHECK_EQ(res.get(j), expected);
292-
}
288+
batch_type expected(value_type(1) << i);
289+
CHECK_BATCH_EQ(res, expected);
293290
}
294291
lhs = batch_type(std::numeric_limits<value_type>::max());
295292
for (int32_t i = 0; i < s; ++i)
296293
{
297294
res = lhs >> i;
298-
value_type expected = std::numeric_limits<value_type>::max() >> i;
299-
for (std::size_t j = 0; j < size; ++j)
300-
{
301-
CHECK_EQ(res.get(j), expected);
302-
}
295+
batch_type expected(std::numeric_limits<value_type>::max() >> i);
296+
CHECK_BATCH_EQ(res, expected);
303297
}
304298
}
305299

0 commit comments

Comments
 (0)