
Commit eefd19c

Merge pull request #964 from xtensor-stack/feature/syndicate-fast-cast-code

Provide a generic version for uint32_t to float conversion, only if t…

2 parents 011d355 + 0ba53ef

File tree

4 files changed: +13 -34 lines changed

include/xsimd/arch/generic/xsimd_generic_details.hpp

Lines changed: 13 additions & 0 deletions
@@ -197,6 +197,19 @@ namespace xsimd
                 v_hi_flt = cnst65536f * v_hi_flt; /* No rounding */
                 return v_hi_flt + v_lo_flt; /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
             }
+
+            // Provide a generic float -> uint32_t cast only if we have a
+            // non-generic float -> int32_t fast_cast
+            template <class A, class _ = decltype(fast_cast(std::declval<batch<float, A> const&>(), std::declval<batch<int32_t, A> const&>(), A {}))>
+            inline batch<uint32_t, A> fast_cast(batch<float, A> const& v, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
+            {
+                auto is_large = v >= batch<float, A>(1u << 31);
+                auto small = bitwise_cast<float>(batch_cast<int32_t>(v));
+                auto large = bitwise_cast<float>(
+                    batch_cast<int32_t>(v - batch<float, A>(1u << 31))
+                    ^ batch<int32_t, A>(1u << 31));
+                return bitwise_cast<uint32_t>(select(is_large, large, small));
+            }
         }

         namespace detail
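
For reference, a minimal scalar sketch of the trick the new generic overload vectorizes with select() — not part of the commit, and the helper name float_to_uint32 is hypothetical:

#include <cstdint>
#include <cstdio>

// Values below 2^31 fit in int32_t and convert directly; values at or
// above 2^31 are shifted down by 2^31 before the signed conversion and
// the top bit is then restored with an XOR, mirroring the batch code.
std::uint32_t float_to_uint32(float v)
{
    const float two31 = 2147483648.0f; // 1u << 31 as a float
    if (v >= two31)
        return static_cast<std::uint32_t>(static_cast<std::int32_t>(v - two31)) ^ (1u << 31);
    return static_cast<std::uint32_t>(static_cast<std::int32_t>(v));
}

int main()
{
    std::printf("%u\n", float_to_uint32(42.5f));         // prints 42 (truncation)
    std::printf("%u\n", float_to_uint32(3000000000.0f)); // prints 3000000000
}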

include/xsimd/arch/xsimd_avx.hpp

Lines changed: 0 additions & 11 deletions
@@ -520,17 +520,6 @@ namespace xsimd
             {
                 return _mm256_cvttps_epi32(self);
             }
-
-            template <class A>
-            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<avx>) noexcept
-            {
-                return _mm256_castps_si256(
-                    _mm256_blendv_ps(_mm256_castsi256_ps(_mm256_cvttps_epi32(self)),
-                                     _mm256_xor_ps(
-                                         _mm256_castsi256_ps(_mm256_cvttps_epi32(_mm256_sub_ps(self, _mm256_set1_ps(1u << 31)))),
-                                         _mm256_castsi256_ps(_mm256_set1_epi32(1u << 31))),
-                                     _mm256_cmp_ps(self, _mm256_set1_ps(1u << 31), _CMP_GE_OQ)));
-            }
         }

         // decr_if

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 0 additions & 12 deletions
@@ -573,18 +573,6 @@ namespace xsimd
             {
                 return _mm_cvttps_epi32(self);
             }
-
-            template <class A>
-            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse2>) noexcept
-            {
-                __m128 mask = _mm_cmpge_ps(self, _mm_set1_ps(1u << 31));
-                __m128 lhs = _mm_castsi128_ps(_mm_cvttps_epi32(self));
-                __m128 rhs = _mm_castsi128_ps(_mm_xor_si128(
-                    _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
-                    _mm_set1_epi32(1u << 31)));
-                return _mm_castps_si128(_mm_or_ps(_mm_and_ps(mask, rhs), _mm_andnot_ps(mask, lhs)));
-            }
-
         }

         // eq

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 0 additions & 11 deletions
@@ -65,17 +65,6 @@ namespace xsimd
                 __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
                 return _mm_add_pd(f, _mm_castsi128_pd(xL));
             }
-
-            template <class A>
-            inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse4_1>) noexcept
-            {
-                return _mm_castps_si128(
-                    _mm_blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(self)),
-                                  _mm_castsi128_ps(_mm_xor_si128(
-                                      _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
-                                      _mm_set1_epi32(1u << 31))),
-                                  _mm_cmpge_ps(self, _mm_set1_ps(1u << 31))));
-            }
         }

         // eq
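
After this change, the SSE2, SSE4.1, and AVX float -> uint32_t conversions all funnel through the single generic overload above. A short usage sketch, assuming xsimd's public batch_cast and batch::get accessors — illustrative, not taken from the commit:

#include <cstdio>
#include "xsimd/xsimd.hpp"

int main()
{
    // Broadcast a value above 2^31 and push it through the
    // float -> uint32_t fast path consolidated by this commit.
    xsimd::batch<float> v(3000000000.0f);
    xsimd::batch<uint32_t> u = xsimd::batch_cast<uint32_t>(v);
    std::printf("%u\n", u.get(0)); // prints 3000000000
}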

0 commit comments
