@@ -398,8 +398,15 @@ auto left_shift_no_overflow(const xsimd::batch<Int, Arch>& batch,
398398 };
399399
400400 constexpr auto kMults = xsimd::make_batch_constant<Int, Arch, MakeMults>();
401- return batch * kMults ;
402-
401+ // TODO in xsimd 14.0 this can be simplified to
402+ // constexpr auto kMults = xsimd::make_batch_constant<Int, 1, Arch>() << shifts;
403+ if constexpr (sizeof (Int) == sizeof (uint16_t )) {
404+ return _mm_mullo_epi16 (batch, kMults .as_batch ());
405+ }
406+ if constexpr (sizeof (Int) == sizeof (uint16_t )) {
407+ // TODO that is latency 10 so maybe it is not worth it
408+ return _mm_mullo_epi32 (batch, kMults .as_batch ());
409+ }
403410 } else {
404411 return batch << shifts;
405412 }
@@ -434,6 +441,15 @@ auto right_shift_by_excess(const xsimd::batch<Int, Arch>& batch,
434441 };
435442
436443 constexpr auto kMults = xsimd::make_batch_constant<Int, Arch, MakeMults>();
444+ if constexpr (sizeof (Int) == sizeof (uint16_t )) {
445+ return xsimd::batch<Int, Arch>(_mm_mullo_epi16 (batch, kMults .as_batch ())) >>
446+ kMaxRightShift ;
447+ }
448+ if constexpr (sizeof (Int) == sizeof (uint16_t )) {
449+ // TODO that is latency 10 so maybe it is not worth it
450+ return xsimd::batch<Int, Arch>(_mm_mullo_epi32 (batch, kMults .as_batch ())) >>
451+ kMaxRightShift ;
452+ }
437453 return (batch * kMults ) >> kMaxRightShift ;
438454
439455 } else {
0 commit comments