@@ -1131,24 +1131,22 @@ _mm256_inserti128_si256(__m256i __a, __m128i __b, const int imm8) {
 
 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
 _mm_maskload_epi32(int32_t const *__p, __m128i __m) {
-  // This may cause an out-of-bounds memory load since we first load and
-  // then mask, but since there are no segmentation faults in Wasm memory
-  // accesses, that is ok (as long as we are within the heap bounds -
-  // a negligible limitation in practice)
-  // TODO, loadu or load, 128-bit align?
-  return _mm_and_si128(_mm_load_si128((const __m128i *)__p),
-                       _mm_srai_epi32(__m, 31));
+  int32_t lane[4];
+  for (size_t i = 0; i < 4; i++) {
+    uint32_t mask = ((__i32x4)__m)[i];
+    lane[i] = ((mask >> 31) & 0x1) ? __p[i] : 0;
+  }
+  return (__m128i)wasm_i32x4_make(lane[0], lane[1], lane[2], lane[3]);
 }
 
 static __inline__ __m128i __attribute__((__always_inline__, __nodebug__))
 _mm_maskload_epi64(int64_t const *__p, __m128i __m) {
-  // This may cause an out-of-bounds memory load since we first load and
-  // then mask, but since there are no segmentation faults in Wasm memory
-  // accesses, that is ok (as long as we are within the heap bounds -
-  // a negligible limitation in practice)
-  // TODO, loadu or load, 128-bit align?
-  return _mm_and_si128(_mm_load_si128((const __m128i *)__p),
-                       wasm_i64x2_shr(__m, 63));
+  int64_t lane[2];
+  for (size_t i = 0; i < 2; i++) {
+    uint64_t mask = ((__i64x2)__m)[i];
+    lane[i] = ((mask >> 63) & 0x1) ? __p[i] : 0;
+  }
+  return (__m128i)wasm_i64x2_make(lane[0], lane[1]);
 }
 
 static __inline__ __m256i __attribute__((__always_inline__, __nodebug__))
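The change replaces the old load-then-mask emulation with a lane-by-lane masked load: inactive lanes are never dereferenced, so a mask selecting only the leading elements of a buffer can no longer read a full 16 bytes past the valid data (the out-of-bounds and alignment concerns noted in the removed comments). As a minimal usage sketch of the lane-wise behavior at a call site (assuming compilation through emscripten's AVX emulation, e.g. `emcc -msimd128 -mavx`; buffer contents and mask values below are illustrative, not from the source):

// Illustrative only: a lane is loaded iff the sign bit of its mask
// element is set; all other lanes become 0 without touching memory.
#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

int main(void) {
  int32_t buf32[4] = {11, 22, 33, 44};
  // _mm_set_epi32 takes lanes high-to-low: only lane 3's sign bit is
  // set, so only buf32[3] is actually read.
  __m128i m32 = _mm_set_epi32(-1, 0, 0, 0);
  __m128i v32 = _mm_maskload_epi32(buf32, m32);

  int64_t buf64[2] = {100, 200};
  // _mm_set_epi64x also takes lanes high-to-low: only lane 0 is active.
  __m128i m64 = _mm_set_epi64x(0, -1);
  __m128i v64 = _mm_maskload_epi64(buf64, m64);

  int32_t out32[4];
  int64_t out64[2];
  _mm_storeu_si128((__m128i *)out32, v32);
  _mm_storeu_si128((__m128i *)out64, v64);
  printf("%d %d %d %d\n", out32[0], out32[1], out32[2], out32[3]); // 0 0 0 44
  printf("%lld %lld\n", (long long)out64[0], (long long)out64[1]); // 100 0
  return 0;
}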