Commit 50f8885

Change _mm_maskload_epi32/_mm_maskload_epi64 implementation

1 parent: b12f35b

2 files changed: +14 −16 lines

site/source/docs/porting/simd.rst
Lines changed: 2 additions & 2 deletions

@@ -1177,9 +1177,9 @@ The following table highlights the availability and expected performance of diff
    * - _mm256_permute4x64_epi64
      - 💡 emulated with two general shuffle
    * - _mm_maskload_epi32
-     - ⚠️ emulated with SIMD load+shift+and
+     - ❌ scalarized
    * - _mm_maskload_epi64
-     - ⚠️ emulated with SIMD load+shift+and
+     - ❌ scalarized
    * - _mm_maskstore_epi32
      - ❌ scalarized
    * - _mm_maskstore_epi64
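
For context on the table rows above: the two emulation strategies differ in which bytes of memory they touch, not in the lane values they produce. Below is a minimal sketch contrasting them (illustration only, not part of this commit; the helper names maskload_epi32_simd and maskload_epi32_scalar are hypothetical):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Old strategy ("emulated with SIMD load+shift+and"): read all 16
 * bytes at p unconditionally, then zero out the masked-off lanes. */
static void maskload_epi32_simd(int32_t out[4], const int32_t* p,
                                const int32_t mask[4]) {
  int32_t tmp[4];
  memcpy(tmp, p, sizeof tmp);            /* touches all four lanes */
  for (int i = 0; i < 4; i++)
    out[i] = tmp[i] & (mask[i] >> 31);   /* arithmetic shift yields 0 or ~0
                                            on the usual two's-complement ABIs */
}

/* New strategy ("scalarized"): read only the lanes whose mask sign
 * bit is set; masked-off lanes are never dereferenced. */
static void maskload_epi32_scalar(int32_t out[4], const int32_t* p,
                                  const int32_t mask[4]) {
  for (int i = 0; i < 4; i++)
    out[i] = (((uint32_t)mask[i] >> 31) & 1) ? p[i] : 0;
}

int main(void) {
  int32_t buf[4] = {10, 20, 30, 40};
  int32_t mask[4] = {-1, 0, -1, 0};      /* load lanes 0 and 2 only */
  int32_t a[4], b[4];
  maskload_epi32_simd(a, buf, mask);
  maskload_epi32_scalar(b, buf, mask);
  for (int i = 0; i < 4; i++)
    printf("%d %d\n", a[i], b[i]);       /* identical: 10 0 30 0 */
  return 0;
}

The results match lane for lane; the difference is that the SIMD variant dereferences all 16 bytes even when a lane is masked off, which is exactly what the scalarized version avoids.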

system/include/compat/avx2intrin.h
Lines changed: 12 additions & 14 deletions

@@ -1131,24 +1131,22 @@ _mm256_inserti128_si256(__m256i __a, __m128i __b, const int imm8) {
 
 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
 _mm_maskload_epi32(int32_t const* __p, __m128i __m) {
-  // This may cause an out-of-bounds memory load since we first load and
-  // then mask, but since there are no segmentation faults in Wasm memory
-  // accesses, that is ok (as long as we are within the heap bounds -
-  // a negligible limitation in practice)
-  // TODO, loadu or load, 128-bit align?
-  return _mm_and_si128(_mm_load_si128((const __m128i*)__p),
-                       _mm_srai_epi32(__m, 31));
+  int32_t lane[4];
+  for (size_t i = 0; i < 4; i++) {
+    uint32_t mask = ((__i32x4)__m)[i];
+    lane[i] = ((mask >> 31) & 0x1) ? __p[i] : 0;
+  }
+  return (__m128i)wasm_i32x4_make(lane[0], lane[1], lane[2], lane[3]);
 }
 
 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
 _mm_maskload_epi64(int64_t const* __p, __m128i __m) {
-  // This may cause an out-of-bounds memory load since we first load and
-  // then mask, but since there are no segmentation faults in Wasm memory
-  // accesses, that is ok (as long as we are within the heap bounds -
-  // a negligible limitation in practice)
-  // TODO, loadu or load, 128-bit align?
-  return _mm_and_si128(_mm_load_si128((const __m128i*)__p),
-                       wasm_i64x2_shr(__m, 63));
+  int64_t lane[2];
+  for (size_t i = 0; i < 2; i++) {
+    uint64_t mask = ((__i64x2)__m)[i];
+    lane[i] = ((mask >> 63) & 0x1) ? __p[i] : 0;
+  }
+  return (__m128i)wasm_i64x2_make(lane[0], lane[1]);
 }
 
 static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
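
The comment block deleted above records the motivation for this change: the old code performed a full 128-bit load before masking, so a masked load at the tail of a buffer could read past the allocation (tolerable in Wasm only while the access stays inside the heap). A small usage sketch of the intrinsic at a buffer tail, assuming an Emscripten build with -msimd128 and -mavx2 (the build flags are an assumption, not part of this commit):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  /* Only three valid elements; a full 128-bit load at `data` would
   * also touch the fourth, nonexistent lane. */
  int32_t data[3] = {1, 2, 3};

  /* A set sign bit selects the lane (_mm_set_epi32 takes arguments
   * from the highest lane down, so lane 3 is masked off here). */
  __m128i mask = _mm_set_epi32(0, -1, -1, -1);
  __m128i v = _mm_maskload_epi32(data, mask);

  int32_t out[4];
  _mm_storeu_si128((__m128i*)out, v);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 1 2 3 0 */
  return 0;
}

With the scalarized implementation, data[3] is never dereferenced, because the loop only reads lanes whose mask sign bit is set.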
