Skip to content

Commit c979f5e

Browse files
Extend support of batch_cast<...> to upcasting to a type twice as big
Fix #1179
1 parent cbf693c commit c979f5e

File tree

7 files changed

+165
-1
lines changed

7 files changed

+165
-1
lines changed
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/***************************************************************************
2+
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
3+
* Martin Renou *
4+
* Copyright (c) QuantStack *
5+
* Copyright (c) Serge Guelton *
6+
* *
7+
* Distributed under the terms of the BSD 3-Clause License. *
8+
* *
9+
* The full license is in the file LICENSE, distributed with this software. *
10+
****************************************************************************/
11+
12+
#ifndef XSIMD_COMMON_CAST_HPP
13+
#define XSIMD_COMMON_CAST_HPP
14+
15+
#include "../../types/xsimd_traits.hpp"
16+
17+
namespace xsimd
18+
{
19+
namespace kernel
20+
{
21+
template <class A, class T>
22+
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<common>) noexcept
23+
{
24+
alignas(A::alignment()) T buffer[batch<T, A>::size];
25+
x.store_aligned(&buffer[0]);
26+
27+
using T_out = widen_t<T>;
28+
alignas(A::alignment()) T_out out_buffer[batch<T, A>::size];
29+
for (size_t i = 0; i < batch<T, A>::size; ++i)
30+
out_buffer[i] = static_cast<T_out>(buffer[i]);
31+
32+
return { batch<T_out, A>::load_aligned(&out_buffer[0]),
33+
batch<T_out, A>::load_aligned(&out_buffer[batch<T_out, A>::size]) };
34+
}
35+
36+
}
37+
38+
}
39+
40+
#endif

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1918,6 +1918,23 @@ namespace xsimd
19181918
return {};
19191919
}
19201920
}
1921+
1922+
// widen
1923+
template <class A, class T>
1924+
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx>) noexcept
1925+
{
1926+
auto pair_lo = widen(batch<T, sse4_2>(_mm256_extractf128_si256(x, 0)), sse4_2 {});
1927+
auto pair_hi = widen(batch<T, sse4_2>(_mm256_extractf128_si256(x, 1)), sse4_2 {});
1928+
return { detail::merge_sse(pair_lo[0], pair_lo[1]), detail::merge_sse(pair_hi[0], pair_hi[1]) };
1929+
}
1930+
template <class A>
1931+
XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<avx>) noexcept
1932+
{
1933+
__m256d lo = _mm256_cvtps_pd(_mm256_extractf128_ps(x, 0));
1934+
__m256d hi = _mm256_cvtps_pd(_mm256_extractf128_ps(x, 1));
1935+
return { lo, hi };
1936+
}
1937+
19211938
}
19221939
}
19231940

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1248,6 +1248,32 @@ namespace xsimd
12481248
return {};
12491249
}
12501250
}
1251+
1252+
// widen
1253+
template <class A, class T>
1254+
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx2>) noexcept
1255+
{
1256+
__m128i x_lo = _mm256_extracti128_si256(x, 0);
1257+
__m128i x_hi = _mm256_extracti128_si256(x, 1);
1258+
__m256i lo, hi;
1259+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
1260+
{
1261+
lo = _mm256_cvtepi32_epi64(x_lo);
1262+
hi = _mm256_cvtepi32_epi64(x_hi);
1263+
}
1264+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
1265+
{
1266+
lo = _mm256_cvtepi16_epi32(x_lo);
1267+
hi = _mm256_cvtepi16_epi32(x_hi);
1268+
}
1269+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
1270+
{
1271+
lo = _mm256_cvtepi8_epi16(x_lo);
1272+
hi = _mm256_cvtepi8_epi16(x_hi);
1273+
}
1274+
return { lo, hi };
1275+
}
1276+
12511277
}
12521278
}
12531279

include/xsimd/arch/xsimd_common.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#define XSIMD_COMMON_HPP
1414

1515
#include "./common/xsimd_common_arithmetic.hpp"
16+
#include "./common/xsimd_common_cast.hpp"
1617
#include "./common/xsimd_common_complex.hpp"
1718
#include "./common/xsimd_common_logical.hpp"
1819
#include "./common/xsimd_common_math.hpp"

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include <type_traits>
1616

1717
#include "../types/xsimd_sse4_1_register.hpp"
18+
#include "./common/xsimd_common_cast.hpp"
1819

1920
namespace xsimd
2021
{
@@ -382,6 +383,39 @@ namespace xsimd
382383
return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
383384
}
384385

386+
// widen
387+
template <class A, class T>
388+
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<sse4_1>) noexcept
389+
{
390+
__m128i x_shuf = _mm_unpackhi_epi64(x, x);
391+
__m128i lo, hi;
392+
XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
393+
{
394+
lo = _mm_cvtepi32_epi64(x);
395+
hi = _mm_cvtepi32_epi64(x_shuf);
396+
}
397+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
398+
{
399+
lo = _mm_cvtepi16_epi32(x);
400+
hi = _mm_cvtepi16_epi32(x_shuf);
401+
}
402+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
403+
{
404+
lo = _mm_cvtepi8_epi16(x);
405+
hi = _mm_cvtepi8_epi16(x_shuf);
406+
}
407+
return { lo, hi };
408+
}
409+
template <class A>
410+
XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<sse4_1>) noexcept
411+
{
412+
__m128 x_shuf = _mm_unpackhi_ps(x, x);
413+
__m128d lo = _mm_cvtps_pd(x);
414+
__m128d hi = _mm_cvtps_pd(x_shuf);
415+
return { lo, hi };
416+
}
417+
418+
385419
}
386420

387421
}

include/xsimd/types/xsimd_api.hpp

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -256,7 +256,7 @@ namespace xsimd
256256
* @param x batch of \c T_in
257257
* @return \c x cast to \c T_out
258258
*/
259-
template <class T_out, class T_in, class A>
259+
template <class T_out, class T_in, class A, class = typename std::enable_if<sizeof(T_out) == sizeof(T_in), void>::type>
260260
XSIMD_INLINE batch<T_out, A> batch_cast(batch<T_in, A> const& x) noexcept
261261
{
262262
detail::static_check_supported_config<T_out, A>();
@@ -2719,6 +2719,22 @@ namespace xsimd
27192719
return !xsimd::any(x);
27202720
}
27212721

2722+
2723+
/**
2724+
* @ingroup batch_conversion
2725+
*
2726+
* Widen batch \c x from type \c T to a type with twice as many bytes and
2727+
* the same sign (for integers) or from float to double.
2728+
* @param x batch of \c T
2729+
* @return two batches of \c widen_t<T>
2730+
*/
2731+
template <class T, class A>
2732+
XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x) noexcept
2733+
{
2734+
detail::static_check_supported_config<T, A>();
2735+
return kernel::widen<A>(x, A {});
2736+
}
2737+
27222738
/**
27232739
* @ingroup batch_miscellaneous
27242740
*

include/xsimd/types/xsimd_traits.hpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,36 @@ namespace xsimd
332332

333333
template <class T>
334334
using mask_type_t = typename mask_type<T>::type;
335+
336+
337+
namespace detail
{
    /**
     * Maps an element type to the type twice its size.
     *
     * The explicit specializations below cover the unsigned integers and
     * \c float. Every other type goes through this primary template, which
     * looks up the widened type of its unsigned counterpart and converts it
     * back to a signed type when \c T itself is signed, so that signedness
     * is preserved (e.g. \c int8_t -> \c int16_t).
     */
    template <typename T>
    struct widen
    {
    private:
        using unsigned_wide_type = typename widen<typename std::make_unsigned<T>::type>::type;

    public:
        using type = typename std::conditional<std::is_signed<T>::value,
                                               typename std::make_signed<unsigned_wide_type>::type,
                                               unsigned_wide_type>::type;
    };

    template <>
    struct widen<uint32_t>
    {
        using type = uint64_t;
    };
    template <>
    struct widen<uint16_t>
    {
        using type = uint32_t;
    };
    template <>
    struct widen<uint8_t>
    {
        // Twice as wide as the 8-bit input.
        using type = uint16_t;
    };
    template <>
    struct widen<float>
    {
        using type = double;
    };
}

/// Type twice as wide as \c T, with the same signedness for integers;
/// \c double for \c float.
template <typename T>
using widen_t = typename detail::widen<T>::type;
364+
335365
}
336366

337367
#endif

0 commit comments

Comments
 (0)