Extend support of batch_cast<...> to upcasting to a type twice as big

serge-sans-paille · serge-sans-paille · commit c979f5eb8892 · 2025-10-30T10:44:27.000+01:00
Fix #1179
diff --git a/include/xsimd/arch/common/xsimd_common_cast.hpp b/include/xsimd/arch/common/xsimd_common_cast.hpp
@@ -0,0 +1,40 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_COMMON_CAST_HPP
+#define XSIMD_COMMON_CAST_HPP
+
+#include "../../types/xsimd_traits.hpp"
+
+namespace xsimd
+{
+    namespace kernel
+    {
+        // Fallback widen: converts lane by lane through aligned scratch arrays, then reloads the result as two batches.
+        template <class A, class T>
+        XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<common>) noexcept
+        {
+            using wide_type = widen_t<T>;
+            constexpr size_t lane_count = batch<T, A>::size;
+            alignas(A::alignment()) T narrow_lanes[lane_count];
+            alignas(A::alignment()) wide_type wide_lanes[lane_count];
+            x.store_aligned(narrow_lanes);
+            for (size_t lane = 0; lane != lane_count; ++lane)
+                wide_lanes[lane] = static_cast<wide_type>(narrow_lanes[lane]);
+            // First batch: lanes [0, batch<wide_type, A>::size); second batch: the lanes stored right after them.
+            return { batch<wide_type, A>::load_aligned(wide_lanes), batch<wide_type, A>::load_aligned(wide_lanes + batch<wide_type, A>::size) };
+        }
+
+    }
+
+}
+
+#endif
diff --git a/include/xsimd/arch/xsimd_avx.hpp b/include/xsimd/arch/xsimd_avx.hpp
@@ -1918,6 +1918,23 @@ namespace xsimd
                 return {};
             }
         }
+
+        // widen: each 128-bit half of x is widened through the sse4_2 dispatch, then each pair of results is joined with detail::merge_sse
+        template <class A, class T>
+        XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx>) noexcept
+        {
+            std::array<batch<widen_t<T>, sse4_2>, 2> const from_lower = widen(batch<T, sse4_2>(_mm256_extractf128_si256(x, 0)), sse4_2 {});
+            std::array<batch<widen_t<T>, sse4_2>, 2> const from_upper = widen(batch<T, sse4_2>(_mm256_extractf128_si256(x, 1)), sse4_2 {});
+            return { detail::merge_sse(from_lower[0], from_lower[1]), detail::merge_sse(from_upper[0], from_upper[1]) };
+        }
+        template <class A>
+        XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<avx>) noexcept
+        {
+            // float -> double: each 128-bit half of x (4 floats) converts into one 256-bit register of 4 doubles
+            return { _mm256_cvtps_pd(_mm256_extractf128_ps(x, 0)),
+                     _mm256_cvtps_pd(_mm256_extractf128_ps(x, 1)) };
+        }
+
     }
 }
 
diff --git a/include/xsimd/arch/xsimd_avx2.hpp b/include/xsimd/arch/xsimd_avx2.hpp
@@ -1248,6 +1248,32 @@ namespace xsimd
                 return {};
             }
         }
+
+        // widen (integers): returns { widened lower 128 bits of x, widened upper 128 bits of x }.
+        // Signed T is sign-extended (cvtepi*) and unsigned T zero-extended (cvtepu*) so that every lane
+        // keeps its value: sign-extending uint8_t 200 (0xC8) would yield 0xFFC8 instead of 200.
+        // The enable_if restricts this kernel to integers; floating point batches are left to other
+        // overloads (the intrinsics below operate on integer registers only).
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx2>) noexcept
+        {
+            static_assert(sizeof(T) <= 4, "widen: no integer lane twice as wide as T");
+            constexpr bool sign_extend = std::is_signed<T>::value;
+            __m128i x_lo = _mm256_extracti128_si256(x, 0);
+            __m128i x_hi = _mm256_extracti128_si256(x, 1);
+
+            // Each cvtep* call turns one 128-bit half into a full 256-bit register of lanes twice as wide.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                return { sign_extend ? _mm256_cvtepi32_epi64(x_lo) : _mm256_cvtepu32_epi64(x_lo),
+                         sign_extend ? _mm256_cvtepi32_epi64(x_hi) : _mm256_cvtepu32_epi64(x_hi) };
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                return { sign_extend ? _mm256_cvtepi16_epi32(x_lo) : _mm256_cvtepu16_epi32(x_lo),
+                         sign_extend ? _mm256_cvtepi16_epi32(x_hi) : _mm256_cvtepu16_epi32(x_hi) };
+            else // sizeof(T) == 1
+                return { sign_extend ? _mm256_cvtepi8_epi16(x_lo) : _mm256_cvtepu8_epi16(x_lo),
+                         sign_extend ? _mm256_cvtepi8_epi16(x_hi) : _mm256_cvtepu8_epi16(x_hi) };
+        }
+
     }
 }
 
diff --git a/include/xsimd/arch/xsimd_common.hpp b/include/xsimd/arch/xsimd_common.hpp
@@ -13,6 +13,7 @@
 #define XSIMD_COMMON_HPP
 
 #include "./common/xsimd_common_arithmetic.hpp"
+#include "./common/xsimd_common_cast.hpp"
 #include "./common/xsimd_common_complex.hpp"
 #include "./common/xsimd_common_logical.hpp"
 #include "./common/xsimd_common_math.hpp"
diff --git a/include/xsimd/arch/xsimd_sse4_1.hpp b/include/xsimd/arch/xsimd_sse4_1.hpp
@@ -15,6 +15,7 @@
 #include <type_traits>
 
 #include "../types/xsimd_sse4_1_register.hpp"
+#include "./common/xsimd_common_cast.hpp"
 
 namespace xsimd
 {
@@ -382,6 +383,39 @@ namespace xsimd
             return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
         }
 
+        // widen (integers): returns { widened lower 64 bits of x, widened upper 64 bits of x }.
+        // Signed T is sign-extended (cvtepi*) and unsigned T zero-extended (cvtepu*) so that every lane
+        // keeps its value: sign-extending uint8_t 200 (0xC8) would yield 0xFFC8 instead of 200.
+        // (float batches have their own float -> double overload.)
+        template <class A, class T>
+        XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<sse4_1>) noexcept
+        {
+            static_assert(sizeof(T) <= 4, "widen: no integer lane twice as wide as T");
+            constexpr bool sign_extend = std::is_signed<T>::value;
+            // The cvtep* conversions only read the low 64 bits of their operand: move the upper half of x down.
+            __m128i x_shuf = _mm_unpackhi_epi64(x, x);
+
+            // Pick the conversion matching the lane width of T.
+            XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+                return { sign_extend ? _mm_cvtepi32_epi64(x) : _mm_cvtepu32_epi64(x),
+                         sign_extend ? _mm_cvtepi32_epi64(x_shuf) : _mm_cvtepu32_epi64(x_shuf) };
+            else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+                return { sign_extend ? _mm_cvtepi16_epi32(x) : _mm_cvtepu16_epi32(x),
+                         sign_extend ? _mm_cvtepi16_epi32(x_shuf) : _mm_cvtepu16_epi32(x_shuf) };
+            else // sizeof(T) == 1
+                return { sign_extend ? _mm_cvtepi8_epi16(x) : _mm_cvtepu8_epi16(x),
+                         sign_extend ? _mm_cvtepi8_epi16(x_shuf) : _mm_cvtepu8_epi16(x_shuf) };
+        }
+        // widen (float -> double): element [0] converts lanes 0-1 of x, element [1] converts lanes 2-3.
+        template <class A>
+        XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<sse4_1>) noexcept
+        {
+            // _mm_cvtps_pd converts only the two lowest floats. _mm_movehl_ps moves lanes {2, 3} down intact,
+            // whereas _mm_unpackhi_ps(x, x) interleaves them into {2, 2, 3, 3} and would convert lane 2 twice.
+            return { _mm_cvtps_pd(x), _mm_cvtps_pd(_mm_movehl_ps(x, x)) };
+        }
+
+
     }
 
 }
diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp
@@ -256,7 +256,7 @@ namespace xsimd
      * @param x batch of \c T_in
      * @return \c x cast to \c T_out
      */
-    template <class T_out, class T_in, class A>
+    template <class T_out, class T_in, class A, class = typename std::enable_if<sizeof(T_out) == sizeof(T_in), void>::type>
     XSIMD_INLINE batch<T_out, A> batch_cast(batch<T_in, A> const& x) noexcept
     {
         detail::static_check_supported_config<T_out, A>();
@@ -2719,6 +2719,22 @@ namespace xsimd
         return !xsimd::any(x);
     }
 
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Widens batch \c x: its lanes are converted to \c widen_t<T> and returned as two batches.
+     * @param x batch of \c T
+     * @return array of two batches of \c widen_t<T>, as produced by the kernel selected for \c A
+     */
+    template <class T, class A>
+    XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>(); // validates the (T, A) configuration first
+        A const arch_tag {};
+        return kernel::widen<A>(x, arch_tag);
+    }
+
     /**
      * @ingroup batch_miscellaneous
      *
diff --git a/include/xsimd/types/xsimd_traits.hpp b/include/xsimd/types/xsimd_traits.hpp
@@ -332,6 +332,36 @@ namespace xsimd
 
     template <class T>
     using mask_type_t = typename mask_type<T>::type;
+
+
+    namespace detail {
+    // widen<T>::type is the element type twice as wide as T, with the signedness of T.
+    // Unsigned integers and float are listed explicitly below; every other type goes through
+    // this primary template. Types with nothing wider to map to (64-bit integers, double)
+    // are not supported and fail to compile.
+    template <typename T>
+    struct widen
+    {
+        // Widened type of the unsigned counterpart of T...
+        using unsigned_wide = typename widen<typename std::make_unsigned<T>::type>::type;
+        // ...converted back to a signed type when T is signed (int16_t -> int32_t, not uint32_t).
+        using type = typename std::conditional<std::is_signed<T>::value, typename std::make_signed<unsigned_wide>::type, unsigned_wide>::type;
+    };
+
+    template <>
+    struct widen<uint32_t> { using type = uint64_t; };
+    template <>
+    struct widen<uint16_t> { using type = uint32_t; };
+    template <>
+    struct widen<uint8_t> { using type = uint16_t; };
+    template <>
+    struct widen<float> { using type = double; };
+    }
+
+    // Shorthand for detail::widen<T>::type.
+    template <typename T>
+    using widen_t = typename detail::widen<T>::type;
+
 }
 
 #endif

Original file line number	Diff line number	Diff line change
`@@ -1918,6 +1918,23 @@ namespace xsimd`
`1918`	`1918`	`return {};`
`1919`	`1919`	`}`
`1920`	`1920`	`}`
	`1921`	`+`
	`1922`	`+ // widen`
	`1923`	`+ template <class A, class T>`
	`1924`	`+ XSIMD_INLINE std::array<batch<widen_t<T>, A>, 2> widen(batch<T, A> const& x, requires_arch<avx>) noexcept`
	`1925`	`+ {`
	`1926`	`+ auto pair_lo = widen(batch<T, sse4_2>(_mm256_extractf128_si256(x, 0)), sse4_2 {});`
	`1927`	`+ auto pair_hi = widen(batch<T, sse4_2>(_mm256_extractf128_si256(x, 1)), sse4_2 {});`
	`1928`	`+ return { detail::merge_sse(pair_lo[0], pair_lo[1]), detail::merge_sse(pair_hi[0], pair_hi[1]) };`
	`1929`	`+ }`
	`1930`	`+ template <class A>`
	`1931`	`+ XSIMD_INLINE std::array<batch<double, A>, 2> widen(batch<float, A> const& x, requires_arch<avx>) noexcept`
	`1932`	`+ {`
	`1933`	`+ __m256d lo = _mm256_cvtps_pd(_mm256_extractf128_ps(x, 0));`
	`1934`	`+ __m256d hi = _mm256_cvtps_pd(_mm256_extractf128_ps(x, 1));`
	`1935`	`+ return { lo, hi };`
	`1936`	`+ }`
	`1937`	`+`
`1921`	`1938`	`}`
`1922`	`1939`	`}`
`1923`	`1940`