Skip to content

Commit 011d355

Browse files
authored
Merge pull request #963 from xtensor-stack/feature/syndicate-fast-cast-code
Provide a generic version for uint32_t to float conversion
2 parents 029aa9b + f9dcafb commit 011d355

File tree

4 files changed

+17
-46
lines changed

4 files changed

+17
-46
lines changed

include/xsimd/arch/generic/xsimd_generic_details.hpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,23 @@ namespace xsimd
180180
{
181181
return bitwise_cast<int64_t>(self);
182182
}
183+
184+
// Provide a generic uint32_t -> float cast only if we have a
185+
// non-generic int32_t -> float fast_cast: the defaulted template parameter below is well-formed only when that fast_cast call resolves for A, otherwise this overload drops out of overload resolution
186+
template <class A, class _ = decltype(fast_cast(std::declval<batch<int32_t, A> const&>(), std::declval<batch<float, A> const&>(), A {}))>
187+
inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<generic>) noexcept
188+
{
189+
// Splits every lane of v into its low and high 16-bit halves, converts each half with batch_cast (uint32_t -> int32_t -> float) and recombines them as hi * 65536 + lo; see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
190+
batch<uint32_t, A> msk_lo(0xFFFF);
191+
batch<float, A> cnst65536f(65536.0f);
192+
193+
auto v_lo = batch_cast<int32_t>(v & msk_lo); /* extract the 16 least significant bits of v */
194+
auto v_hi = batch_cast<int32_t>(v >> 16); /* 16 most significant bits of v */
195+
auto v_lo_flt = batch_cast<float>(v_lo); /* No rounding */
196+
auto v_hi_flt = batch_cast<float>(v_hi); /* No rounding */
197+
v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */
198+
return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
199+
}
183200
}
184201

185202
namespace detail

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -515,22 +515,6 @@ namespace xsimd
515515
return _mm256_cvtepi32_ps(self);
516516
}
517517

518-
template <class A>
519-
inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx>) noexcept
520-
{
521-
// Splits every lane of v into its low and high 16-bit halves, converts each half with _mm256_cvtepi32_ps and recombines them as hi * 65536 + lo; see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
522-
// adapted to avx (the integer and/shift go through the bitwise_and / bitwise_rshift helpers rather than direct intrinsics)
523-
__m256i msk_lo = _mm256_set1_epi32(0xFFFF);
524-
__m256 cnst65536f = _mm256_set1_ps(65536.0f);
525-
526-
__m256i v_lo = bitwise_and(batch<uint32_t, A>(v), batch<uint32_t, A>(msk_lo)); /* extract the 16 least significant bits of v */
527-
__m256i v_hi = bitwise_rshift(batch<uint32_t, A>(v), 16, avx {}); /* 16 most significant bits of v */
528-
__m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */
529-
__m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */
530-
v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
531-
return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
532-
}
533-
534518
template <class A>
535519
inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx>) noexcept
536520
{

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -279,21 +279,6 @@ namespace xsimd
279279
namespace detail
280280
{
281281

282-
template <class A>
283-
inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx2>) noexcept
284-
{
285-
// Splits every lane of v into its low and high 16-bit halves, converts each half with _mm256_cvtepi32_ps and recombines them as hi * 65536 + lo; see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
286-
__m256i msk_lo = _mm256_set1_epi32(0xFFFF);
287-
__m256 cnst65536f = _mm256_set1_ps(65536.0f);
288-
289-
__m256i v_lo = _mm256_and_si256(v, msk_lo); /* extract the 16 least significant bits of v */
290-
__m256i v_hi = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */
291-
__m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */
292-
__m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */
293-
v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
294-
return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
295-
}
296-
297282
template <class A>
298283
inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
299284
{

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -541,21 +541,6 @@ namespace xsimd
541541
return _mm_cvtepi32_ps(self);
542542
}
543543

544-
template <class A>
545-
inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<sse2>) noexcept
546-
{
547-
// Splits every lane of v into its low and high 16-bit halves, converts each half with _mm_cvtepi32_ps and recombines them as hi * 65536 + lo; see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
548-
__m128i msk_lo = _mm_set1_epi32(0xFFFF);
549-
__m128 cnst65536f = _mm_set1_ps(65536.0f);
550-
551-
__m128i v_lo = _mm_and_si128(v, msk_lo); /* extract the 16 least significant bits of v */
552-
__m128i v_hi = _mm_srli_epi32(v, 16); /* 16 most significant bits of v */
553-
__m128 v_lo_flt = _mm_cvtepi32_ps(v_lo); /* No rounding */
554-
__m128 v_hi_flt = _mm_cvtepi32_ps(v_hi); /* No rounding */
555-
v_hi_flt = _mm_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
556-
return _mm_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
557-
}
558-
559544
template <class A>
560545
inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
561546
{

0 commit comments

Comments
 (0)