@@ -292,6 +292,35 @@ namespace xsimd
292292 return {};
293293 }
294294 }
295+ template <size_t shift, class A , class T , class = typename std::enable_if<std::is_integral<T>::value, void >::type>
296+ XSIMD_INLINE batch<T, A> bitwise_lshift (batch<T, A> const & self, requires_arch<sse2>) noexcept
297+ {
298+ static_assert (shift < std::numeric_limits<T>::digits, " Shift amount must be less than the number of value bits in the type" );
299+ XSIMD_IF_CONSTEXPR (shift == 0 )
300+ {
301+ return self;
302+ }
303+ else XSIMD_IF_CONSTEXPR (sizeof (T) == 1 )
304+ {
305+ // 8-bit left shift via 16-bit shift + mask
306+ __m128i shifted = _mm_slli_epi16 (self, static_cast <int >(shift));
307+ __m128i mask = _mm_set1_epi8 (static_cast <char >(0xFF << shift));
308+ return _mm_and_si128 (shifted, mask);
309+ }
310+ else XSIMD_IF_CONSTEXPR (sizeof (T) == 2 )
311+ {
312+ return _mm_slli_epi16 (self, static_cast <int >(shift));
313+ }
314+ else XSIMD_IF_CONSTEXPR (sizeof (T) == 4 )
315+ {
316+ return _mm_slli_epi32 (self, static_cast <int >(shift));
317+ }
318+ else XSIMD_IF_CONSTEXPR (sizeof (T) == 8 )
319+ {
320+ return _mm_slli_epi64 (self, static_cast <int >(shift));
321+ }
322+ return bitwise_lshift<shift>(self, common {});
323+ }
295324
296325 // bitwise_not
297326 template <class A >
@@ -420,6 +449,62 @@ namespace xsimd
420449 }
421450 }
422451 }
452+ template <size_t shift, class A , class T , class = typename std::enable_if<std::is_integral<T>::value, void >::type>
453+ XSIMD_INLINE batch<T, A> bitwise_rshift (batch<T, A> const & self, requires_arch<sse2>) noexcept
454+ {
455+ static_assert (shift < std::numeric_limits<T>::digits,
456+ " Shift amount must be less than the number of value bits in the type" );
457+
458+ XSIMD_IF_CONSTEXPR (shift == 0 )
459+ {
460+ return self;
461+ }
462+
463+ XSIMD_IF_CONSTEXPR (std::is_signed<T>::value)
464+ {
465+ XSIMD_IF_CONSTEXPR (sizeof (T) == 1 )
466+ {
467+ // 8-bit arithmetic right shift via 16-bit shift + sign-extension handling.
468+ __m128i shifted = _mm_srai_epi16 (self, static_cast <int >(shift));
469+ __m128i sign_mask = _mm_set1_epi16 (static_cast <short >(0xFF00 >> shift));
470+ __m128i cmp_negative = _mm_cmpgt_epi8 (_mm_setzero_si128 (), self);
471+ return _mm_or_si128 (_mm_and_si128 (sign_mask, cmp_negative),
472+ _mm_andnot_si128 (sign_mask, shifted));
473+ }
474+ else XSIMD_IF_CONSTEXPR (sizeof (T) == 2 )
475+ {
476+ return _mm_srai_epi16 (self, static_cast <int >(shift));
477+ }
478+ else XSIMD_IF_CONSTEXPR (sizeof (T) == 4 )
479+ {
480+ return _mm_srai_epi32 (self, static_cast <int >(shift));
481+ }
482+ // No 64-bit arithmetic right shift in SSE2; fall back
483+ return bitwise_rshift<shift>(self, common {});
484+ }
485+ else // unsigned / logical right shift
486+ {
487+ XSIMD_IF_CONSTEXPR (sizeof (T) == 1 )
488+ {
489+ // Emulate byte-wise logical right shift using 16-bit shifts + per-byte mask.
490+ __m128i s16 = _mm_srli_epi16 (self, static_cast <int >(shift));
491+ __m128i mask = _mm_set1_epi8 (static_cast <char >(0xFFu >> shift));
492+ return _mm_and_si128 (s16, mask);
493+ }
494+ else XSIMD_IF_CONSTEXPR (sizeof (T) == 2 )
495+ {
496+ return _mm_srli_epi16 (self, static_cast <int >(shift));
497+ }
498+ else XSIMD_IF_CONSTEXPR (sizeof (T) == 4 )
499+ {
500+ return _mm_srli_epi32 (self, static_cast <int >(shift));
501+ }
502+ else // sizeof(T) == 8
503+ {
504+ return _mm_srli_epi64 (self, static_cast <int >(shift));
505+ }
506+ }
507+ }
423508
424509 // bitwise_xor
425510 template <class A >
@@ -1931,7 +2016,6 @@ namespace xsimd
19312016 {
19322017 return _mm_unpacklo_pd (self, other);
19332018 }
1934-
19352019 }
19362020}
19372021
0 commit comments