@@ -87,7 +87,7 @@ NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride)
 #if 0 // slower
 NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride)
 {
-    const __m256i idx = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
+    const __m256i idx = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
     return _mm256_i64gather_epi64((const void*)ptr, idx, 8);
 }
 NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride)
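The gather path above stays compiled out (#if 0 // slower). As a hedged scalar sketch of the strided load it stands in for, assuming the 4-lane AVX2 vector width and a hypothetical helper name not present in the patch:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical scalar reference: read 4 int64 elements spaced `stride`
 * elements apart, which is what the gather-based npyv_loadn_u64 above
 * computes for one vector. */
static inline void scalar_loadn_s64(const int64_t *ptr, ptrdiff_t stride,
                                    int64_t out[4])
{
    for (size_t i = 0; i < 4; ++i) {
        out[i] = ptr[(ptrdiff_t)i * stride];
    }
}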
@@ -170,9 +170,9 @@ NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
 NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m256i vfill = _mm256_set1_epi64x(fill);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane  = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+    const __m256i vfill = npyv_setall_s64(fill);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane  = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask    = _mm256_cmpgt_epi64(vnlane, steps);
     __m256i payload = _mm256_maskload_epi64((const void*)ptr, mask);
     return _mm256_blendv_epi8(vfill, payload, mask);
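As a rough scalar cross-check (hypothetical helper, same 4-lane assumption), npyv_load_till_s64 behaves like a partial load that pads the tail with fill; the tillz variant in the next hunk is the same with an implicit zero fill:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical scalar reference for npyv_load_till_s64: load the first
 * `nlane` elements (capped at 4) and pad the remaining lanes with `fill`. */
static inline void scalar_load_till_s64(const int64_t *ptr, size_t nlane,
                                        int64_t fill, int64_t out[4])
{
    assert(nlane > 0);
    const size_t n = nlane > 4 ? 4 : nlane;
    for (size_t i = 0; i < 4; ++i) {
        out[i] = i < n ? ptr[i] : fill;
    }
}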
@@ -181,8 +181,8 @@ NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, n
 NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
 {
     assert(nlane > 0);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask   = _mm256_cmpgt_epi64(vnlane, steps);
     return _mm256_maskload_epi64((const void*)ptr, mask);
 }
@@ -211,10 +211,10 @@ NPY_FINLINE npyv_s64
 npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill)
 {
     assert(nlane > 0);
-    const __m256i vfill = _mm256_set1_epi64x(fill);
-    const __m256i idx   = _mm256_setr_epi64x(0, 1*stride, 2*stride, 3*stride);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane = _mm256_set1_epi64x(nlane > 4 ? 4 : (int)nlane);
+    const __m256i vfill = npyv_setall_s64(fill);
+    const __m256i idx   = npyv_set_s64(0, 1*stride, 2*stride, 3*stride);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane = npyv_setall_s64(nlane > 4 ? 4 : (int)nlane);
     __m256i mask   = _mm256_cmpgt_epi64(vnlane, steps);
     return _mm256_mask_i64gather_epi64(vfill, (const void*)ptr, idx, mask, 8);
 }
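The masked gather above combines both ideas: strided indexing plus tail fill. A hedged scalar equivalent (hypothetical helper, same 4-lane assumption):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical scalar reference for npyv_loadn_till_s64: gather up to 4
 * elements spaced `stride` apart; lanes at or beyond `nlane` take `fill`. */
static inline void scalar_loadn_till_s64(const int64_t *ptr, ptrdiff_t stride,
                                         size_t nlane, int64_t fill,
                                         int64_t out[4])
{
    assert(nlane > 0);
    const size_t n = nlane > 4 ? 4 : nlane;
    for (size_t i = 0; i < 4; ++i) {
        out[i] = i < n ? ptr[(ptrdiff_t)i * stride] : fill;
    }
}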
@@ -238,8 +238,8 @@ NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a
 NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
 {
     assert(nlane > 0);
-    const __m256i steps = _mm256_setr_epi64x(0, 1, 2, 3);
-    __m256i vnlane = _mm256_set1_epi64x(nlane > 8 ? 8 : (int)nlane);
+    const __m256i steps = npyv_set_s64(0, 1, 2, 3);
+    __m256i vnlane = npyv_setall_s64(nlane > 8 ? 8 : (int)nlane);
     __m256i mask   = _mm256_cmpgt_epi64(vnlane, steps);
     _mm256_maskstore_epi64((void*)ptr, mask, a);
 }
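And the store side, sketched in scalar form (hypothetical helper): only the first nlane lanes are written and memory past them is left untouched, which matches how _mm256_maskstore_epi64 behaves for lanes whose mask bit is clear.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical scalar reference for npyv_store_till_s64: write only the
 * first `nlane` lanes (capped at 4); trailing memory is not touched. */
static inline void scalar_store_till_s64(int64_t *ptr, size_t nlane,
                                         const int64_t a[4])
{
    assert(nlane > 0);
    const size_t n = nlane > 4 ? 4 : nlane;
    for (size_t i = 0; i < n; ++i) {
        ptr[i] = a[i];
    }
}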