Skip to content

Commit 82e677c

Browse files
committed
change avx2 loads, fix some random errors
1 parent 176b336 commit 82e677c

File tree

3 files changed

+7
-10
lines changed

3 files changed

+7
-10
lines changed

include/xsimd/arch/xsimd_avx2.hpp

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -579,29 +579,26 @@ namespace xsimd
579579
{
580580
XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
581581
{
582-
auto maskz = _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i const*)mem), _mm256_set1_epi8(0));
583-
return _mm256_xor_si256(maskz, _mm256_set1_epi8(-1));
582+
return _mm256_sub_epi8(_mm256_set1_epi8(0), _mm256_loadu_si256((__m256i const*)mem));
584583
}
585584
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
586585
{
587586
auto bpack = _mm_loadu_si128((__m128i const*)mem);
588-
return _mm256_cmpgt_epi16(_mm256_cvtepu8_epi16(bpack), _mm256_set1_epi16(0));
587+
return _mm256_sub_epi16(_mm256_set1_epi8(0), _mm256_cvtepu8_epi16(bpack));
589588
}
590589
// GCC versions before 12 have missing or buggy unaligned load intrinsics; use memcpy to work around this.
591590
// GCC/Clang/MSVC will turn it into the correct load.
592591
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
593592
{
594593
uint64_t tmp;
595594
memcpy(&tmp, mem, sizeof(tmp));
596-
auto bpack = _mm_cvtsi64_si128(tmp);
597-
return _mm256_cmpgt_epi32(_mm256_cvtepu8_epi32(bpack), _mm256_set1_epi32(0));
595+
return _mm256_sub_epi32(_mm256_set1_epi8(0), _mm256_cvtepu8_epi32(_mm_cvtsi64_si128(tmp)));
598596
}
599597
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
600598
{
601599
uint32_t tmp;
602600
memcpy(&tmp, mem, sizeof(tmp));
603-
auto bpack = _mm_cvtsi32_si128(tmp);
604-
return _mm256_cmpgt_epi64(_mm256_cvtepu8_epi64(bpack), _mm256_set1_epi64x(0));
601+
return _mm256_sub_epi64(_mm256_set1_epi8(0), _mm256_cvtepu8_epi64(_mm_cvtsi32_si128(tmp)));
605602
}
606603
else
607604
{

include/xsimd/arch/xsimd_sse2.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1713,8 +1713,9 @@ namespace xsimd
17131713
}
17141714
else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
17151715
{
1716-
uint64_t val = _mm_cvtsi128_si64(_mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b, b)));
1717-
memcpy(mem, &val, sizeof(val));
1716+
auto val = _mm_sub_epi8(_mm_set1_epi8(0), _mm_packs_epi16(b, b));
1717+
// store only lower 64 bits
1718+
memcpy(mem, &val, sizeof(uint64_t));
17181719
}
17191720
else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
17201721
{

include/xsimd/arch/xsimd_sse4_1.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,6 @@ namespace xsimd
144144
{
145145
uint32_t tmp;
146146
memcpy(&tmp, mem, sizeof(tmp));
147-
auto bpack = _mm_cvtsi32_si128(tmp);
148147
return _mm_sub_epi32(_mm_set1_epi8(0), _mm_cvtepu8_epi32(_mm_cvtsi32_si128(tmp)));
149148
}
150149
else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)

0 commit comments

Comments
 (0)