
Commit eefd19c

Merge pull request #964 from xtensor-stack/feature/syndicate-fast-cast-code

Provide a generic version for uint32_t to float conversion, only if t…

2 parents 011d355 + 0ba53ef

File tree

4 files changed: +13 -34 lines changed

include/xsimd/arch/generic/xsimd_generic_details.hpp

Lines changed: 13 additions & 0 deletions
@@ -197,6 +197,19 @@ namespace xsimd
                 v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */
                 return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
             }
+
+            // Provide a generic float -> uint32_t cast only if we have a
+            // non-generic float -> int32_t fast_cast
+            template <class A, class _ = decltype(fast_cast(std::declval<batch<float, A> const&>(), std::declval<batch<int32_t, A> const&>(), A {}))>
+            inline batch<uint32_t, A> fast_cast(batch<float, A> const& v, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
+            {
+                auto is_large = v >= batch<float, A>(1u << 31);
+                auto small = bitwise_cast<float>(batch_cast<int32_t>(v));
+                auto large = bitwise_cast<float>(
+                    batch_cast<int32_t>(v - batch<float, A>(1u << 31))
+                    ^ batch<int32_t, A>(1u << 31));
+                return bitwise_cast<uint32_t>(select(is_large, large, small));
+            }
         }

         namespace detail
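
For reference, a minimal scalar sketch of the trick the new generic overload vectorizes with select() — not part of the commit, and the helper name float_to_uint32 is hypothetical:

#include <cstdint>
#include <cstdio>

// Values below 2^31 fit in int32_t and convert directly; values at or
// above 2^31 are shifted down by 2^31 before the signed conversion and
// the top bit is then restored with an XOR, mirroring the batch code.
std::uint32_t float_to_uint32(float v)
{
    const float two31 = 2147483648.0f; // 1u << 31 as a float
    if (v >= two31)
        return static_cast<std::uint32_t>(static_cast<std::int32_t>(v - two31)) ^ (1u << 31);
    return static_cast<std::uint32_t>(static_cast<std::int32_t>(v));
}

int main()
{
    std::printf("%u\n", float_to_uint32(42.5f));         // prints 42 (truncation)
    std::printf("%u\n", float_to_uint32(3000000000.0f)); // prints 3000000000
}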

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 0 additions & 11 deletions
@@ -520,17 +520,6 @@ namespace xsimd
             {
                 return _mm256_cvttps_epi32(self);
             }
-
-            template <class A>
-            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<avx>) noexcept
-            {
-                return _mm256_castps_si256(
-                    _mm256_blendv_ps(_mm256_castsi256_ps(_mm256_cvttps_epi32(self)),
-                                     _mm256_xor_ps(
-                                         _mm256_castsi256_ps(_mm256_cvttps_epi32(_mm256_sub_ps(self, _mm256_set1_ps(1u << 31)))),
-                                         _mm256_castsi256_ps(_mm256_set1_epi32(1u << 31))),
-                                     _mm256_cmp_ps(self, _mm256_set1_ps(1u << 31), _CMP_GE_OQ)));
-            }
         }

         // decr_if

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 0 additions & 12 deletions
@@ -573,18 +573,6 @@ namespace xsimd
             {
                 return _mm_cvttps_epi32(self);
             }
-
-            template <class A>
-            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse2>) noexcept
-            {
-                __m128 mask = _mm_cmpge_ps(self, _mm_set1_ps(1u << 31));
-                __m128 lhs = _mm_castsi128_ps(_mm_cvttps_epi32(self));
-                __m128 rhs = _mm_castsi128_ps(_mm_xor_si128(
-                    _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
-                    _mm_set1_epi32(1u << 31)));
-                return _mm_castps_si128(_mm_or_ps(_mm_and_ps(mask, rhs), _mm_andnot_ps(mask, lhs)));
-            }
-
         }

         // eq

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 0 additions & 11 deletions
@@ -65,17 +65,6 @@ namespace xsimd
                 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
                 return _mm_add_pd(f, _mm_castsi128_pd(xL));
             }
-
-            template <class A>
-            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse4_1>) noexcept
-            {
-                return _mm_castps_si128(
-                    _mm_blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(self)),
-                                  _mm_castsi128_ps(_mm_xor_si128(
-                                      _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
-                                      _mm_set1_epi32(1u << 31))),
-                                  _mm_cmpge_ps(self, _mm_set1_ps(1u << 31))));
-            }
         }

         // eq
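
After this change, the SSE2, SSE4.1, and AVX float -> uint32_t conversions all funnel through the single generic overload above. A short usage sketch, assuming xsimd's public batch_cast and batch::get accessors — illustrative, not taken from the commit:

#include <cstdio>
#include "xsimd/xsimd.hpp"

int main()
{
    // Broadcast a value above 2^31 and push it through the
    // float -> uint32_t fast path consolidated by this commit.
    xsimd::batch<float> v(3000000000.0f);
    xsimd::batch<uint32_t> u = xsimd::batch_cast<uint32_t>(v);
    std::printf("%u\n", u.get(0)); // prints 3000000000
}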

0 commit comments
