Skip to content

Commit 4ab1d66

Browse files
committed
optimized sse2
1 parent 18b3c9d commit 4ab1d66

File tree

1 file changed

+85
-1
lines changed

1 file changed

+85
-1
lines changed

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 85 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,35 @@ namespace xsimd
292292
return {};
293293
}
294294
}
295+
template <size_t shift, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
296+
XSIMD_INLINE batch<T, A> bitwise_lshift(batch<T, A> const& self, requires_arch<sse2>) noexcept
297+
{
298+
static_assert(shift < std::numeric_limits<T>::digits, "Shift amount must be less than the number of value bits in the type");
299+
XSIMD_IF_CONSTEXPR(shift == 0)
300+
{
301+
return self;
302+
}
303+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
304+
{
305+
// 8-bit left shift via 16-bit shift + mask
306+
__m128i shifted = _mm_slli_epi16(self, static_cast<int>(shift));
307+
__m128i mask = _mm_set1_epi8(static_cast<char>(0xFF << shift));
308+
return _mm_and_si128(shifted, mask);
309+
}
310+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
311+
{
312+
return _mm_slli_epi16(self, static_cast<int>(shift));
313+
}
314+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
315+
{
316+
return _mm_slli_epi32(self, static_cast<int>(shift));
317+
}
318+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
319+
{
320+
return _mm_slli_epi64(self, static_cast<int>(shift));
321+
}
322+
return bitwise_lshift<shift>(self, common {});
323+
}
295324

296325
// bitwise_not
297326
template <class A>
@@ -420,6 +449,62 @@ namespace xsimd
420449
}
421450
}
422451
}
452+
template <size_t shift, class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
453+
XSIMD_INLINE batch<T, A> bitwise_rshift(batch<T, A> const& self, requires_arch<sse2>) noexcept
454+
{
455+
static_assert(shift < std::numeric_limits<T>::digits,
456+
"Shift amount must be less than the number of value bits in the type");
457+
458+
XSIMD_IF_CONSTEXPR(shift == 0)
459+
{
460+
return self;
461+
}
462+
463+
XSIMD_IF_CONSTEXPR(std::is_signed<T>::value)
464+
{
465+
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
466+
{
467+
// 8-bit arithmetic right shift via 16-bit shift + sign-extension handling.
468+
__m128i shifted = _mm_srai_epi16(self, static_cast<int>(shift));
469+
__m128i sign_mask = _mm_set1_epi16(static_cast<short>(0xFF00 >> shift));
470+
__m128i cmp_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self);
471+
return _mm_or_si128(_mm_and_si128(sign_mask, cmp_negative),
472+
_mm_andnot_si128(sign_mask, shifted));
473+
}
474+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
475+
{
476+
return _mm_srai_epi16(self, static_cast<int>(shift));
477+
}
478+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
479+
{
480+
return _mm_srai_epi32(self, static_cast<int>(shift));
481+
}
482+
// No 64-bit arithmetic right shift in SSE2; fall back
483+
return bitwise_rshift<shift>(self, common {});
484+
}
485+
else // unsigned / logical right shift
486+
{
487+
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
488+
{
489+
// Emulate byte-wise logical right shift using 16-bit shifts + per-byte mask.
490+
__m128i s16 = _mm_srli_epi16(self, static_cast<int>(shift));
491+
__m128i mask = _mm_set1_epi8(static_cast<char>(0xFFu >> shift));
492+
return _mm_and_si128(s16, mask);
493+
}
494+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
495+
{
496+
return _mm_srli_epi16(self, static_cast<int>(shift));
497+
}
498+
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
499+
{
500+
return _mm_srli_epi32(self, static_cast<int>(shift));
501+
}
502+
else // sizeof(T) == 8
503+
{
504+
return _mm_srli_epi64(self, static_cast<int>(shift));
505+
}
506+
}
507+
}
423508

424509
// bitwise_xor
425510
template <class A>
@@ -1931,7 +2016,6 @@ namespace xsimd
19312016
{
19322017
return _mm_unpacklo_pd(self, other);
19332018
}
1934-
19352019
}
19362020
}
19372021

0 commit comments

Comments
 (0)