
 #ifdef SIMD_AVX2_FMA3

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_get_full_load_mask_ps(void)
 {
     return _mm256_set1_ps(-1.0);
 }

-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
 fma_get_full_load_mask_pd(void)
 {
     return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
 }

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
 {
     float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
@@ -66,15 +66,15 @@ fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
     return _mm256_loadu_ps(addr);
 }

-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
 fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
 {
     npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
     npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
     return _mm256_loadu_si256((__m256i*) addr);
 }

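The two partial-mask helpers above build a lane mask by sliding a load window over a constant array: the first half holds -1 values (sign bit set, lane active) and the second half holds 1 (sign bit clear, lane inactive), so offsetting the base pointer by `num_lanes - num_elem` elements (doubled for the 64-bit variant, which needs two 32-bit ints per lane) leaves exactly the first `num_elem` lanes active. A minimal scalar sketch of the same idea, with a hypothetical helper name; note that only the sign bit of -1.0f matters, since that is all the AVX2 mask operations examine:

    /* Sketch of fma_get_partial_load_mask_ps's sliding-window trick for
     * 8 lanes: the first num_elem outputs come from the -1.0f half of
     * the window (sign bit set = active), the rest from the 1.0f half. */
    static void partial_mask_f32(float out[8], int num_elem)
    {
        static const float window[16] = {
            -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
             1.0f,  1.0f,  1.0f,  1.0f,  1.0f,  1.0f,  1.0f,  1.0f,
        };
        const float *addr = window + 8 - num_elem;  /* num_lanes - num_elem */
        for (int i = 0; i < 8; i++) {
            out[i] = addr[i];
        }
    }
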
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_masked_gather_ps(__m256 src,
                      npy_float* addr,
                      __m256i vindex,
@@ -83,7 +83,7 @@ fma_masked_gather_ps(__m256 src,
     return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
 }

-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
 fma_masked_gather_pd(__m256d src,
                      npy_double* addr,
                      __m128i vindex,
@@ -92,49 +92,49 @@ fma_masked_gather_pd(__m256d src,
     return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
 }

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_masked_load_ps(__m256 mask, npy_float* addr)
 {
     return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
 }

-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
 fma_masked_load_pd(__m256i mask, npy_double* addr)
 {
     return _mm256_maskload_pd(addr, mask);
 }

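These load helpers let a vector loop handle strided access (the gathers) and a ragged tail (the masked loads) without reading out of bounds. A hedged sketch of the calling pattern, assuming a contiguous unary loop like the ones later in this file; the loop shape and the masked store are illustrative, not part of this patch:

    #include <immintrin.h>

    /* Sketch: process n contiguous floats 8 at a time; the final
     * iteration uses a partial mask so the load stays in bounds. */
    static void unary_loop_sketch(npy_float *dst, npy_float *src, npy_int n)
    {
        const npy_int lanes = 8;
        for (npy_int i = 0; i < n; i += lanes) {
            __m256 mask = (n - i >= lanes) ?
                fma_get_full_load_mask_ps() :
                fma_get_partial_load_mask_ps(n - i, lanes);
            __m256 x = fma_masked_load_ps(mask, src + i);
            /* ... transcendental kernel on x ... */
            _mm256_maskstore_ps(dst + i, _mm256_cvtps_epi32(mask), x);
        }
    }
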
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
 {
     return _mm256_blendv_ps(x, val, mask);
 }

-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
 fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
 {
     return _mm256_blendv_pd(x, val, mask);
 }

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_blend(__m256 x, __m256 y, __m256 ymask)
 {
     return _mm256_blendv_ps(x, y, ymask);
 }

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_invert_mask_ps(__m256 ymask)
 {
     return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
 }

-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
 fma_invert_mask_pd(__m256i ymask)
 {
     return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
 }

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_get_exponent(__m256 x)
 {
     /*
@@ -165,7 +165,7 @@ fma_get_exponent(__m256 x)
     return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
 }

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_get_mantissa(__m256 x)
 {
     /*
@@ -195,7 +195,7 @@ fma_get_mantissa(__m256 x)
                         _mm256_castps_si256(x), mantissa_bits), exp_126_bits));
 }

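Taken together, `fma_get_exponent` and `fma_get_mantissa` decompose x as mantissa * 2^exponent with the mantissa normalized to [0.5, 1), the same contract as C's frexp (their comment bodies, elided by the hunks above, describe the bit-level extraction; denormals take the separate blend path). A scalar reference for that contract, assuming normal inputs:

    #include <math.h>

    /* Scalar model of the fma_get_exponent/fma_get_mantissa pair:
     * x == mant * 2^exp with mant in [0.5, 1).
     * Example: x = 8.0f yields mant = 0.5, exp = 4. */
    static void get_exp_mant_scalar(float x, float *exp, float *mant)
    {
        int e;
        *mant = frexpf(x, &e);
        *exp = (float)e;
    }

The AVX512 variants further down implement the same contract directly with the vgetexp/vgetmant instructions.
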
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_scalef_ps(__m256 poly, __m256 quadrant)
 {
     /*
@@ -238,31 +238,31 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)

 #ifdef SIMD_AVX512F

-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
 avx512_get_full_load_mask_ps(void)
 {
     return 0xFFFF;
 }

-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
 avx512_get_full_load_mask_pd(void)
 {
     return 0xFF;
 }

-static NPY_INLINE __mmask16
+NPY_FINLINE __mmask16
 avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
 {
     return (0x0001 << num_elem) - 0x0001;
 }

-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
 avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
 {
     return (0x01 << num_elem) - 0x01;
 }

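Unlike the AVX2 path, AVX512 has dedicated k-mask registers, so a partial mask is a pure bit trick: `(1 << num_elem) - 1` sets the low `num_elem` bits. A one-line scalar check of the arithmetic:

    /* (1 << n) - 1 sets the low n bits: n = 3 -> 0b1000 - 1 = 0b0111,
     * i.e. lanes 0..2 enabled, exactly what the helpers above return. */
    unsigned low_bits(int n) { return (1u << n) - 1u; }
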
-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_masked_gather_ps(__m512 src,
                         npy_float* addr,
                         __m512i vindex,
@@ -271,7 +271,7 @@ avx512_masked_gather_ps(__m512 src,
     return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
 }

-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_masked_gather_pd(__m512d src,
                         npy_double* addr,
                         __m256i vindex,
@@ -280,67 +280,67 @@ avx512_masked_gather_pd(__m512d src,
     return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
 }

-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
 {
     return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
 }

-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
 {
     return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
 }

-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
 {
     return _mm512_mask_blend_ps(mask, x, val);
 }

-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
 {
     return _mm512_mask_blend_pd(mask, x, val);
 }

-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
 {
     return _mm512_mask_mov_ps(x, ymask, y);
 }

-static NPY_INLINE __mmask16
+NPY_FINLINE __mmask16
 avx512_invert_mask_ps(__mmask16 ymask)
 {
     return _mm512_knot(ymask);
 }

-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
 avx512_invert_mask_pd(__mmask8 ymask)
 {
     return _mm512_knot(ymask);
 }

-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_get_exponent(__m512 x)
 {
     return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f));
 }

-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_get_mantissa(__m512 x)
 {
     return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
 }

-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_scalef_ps(__m512 poly, __m512 quadrant)
 {
     return _mm512_scalef_ps(poly, quadrant);
 }

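The last three helpers are the hardware form of the frexp-style split noted earlier: `getexp` returns floor(log2|x|) (hence the +1 to match the [0.5, 1) mantissa convention), `getmant` with `_MM_MANT_NORM_p5_1` normalizes to [0.5, 1) keeping the source sign, and `scalef(p, q) = p * 2^q` undoes the split. A scalar identity check for normal inputs, using only the documented intrinsic semantics:

    #include <math.h>

    /* Scalar model: mant * 2^exp reconstructs x when
     * exp = floor(log2|x|) + 1 and mant = x / 2^exp.
     * Example: x = 8 -> exp = 4, mant = 0.5, 0.5 * 2^4 = 8. */
    static float reconstruct(float x)
    {
        float e = floorf(log2f(fabsf(x))) + 1.0f; /* avx512_get_exponent */
        float m = x / exp2f(e);                   /* avx512_get_mantissa */
        return m * exp2f(e);                      /* avx512_scalef_ps    */
    }
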
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_permute_x4var_pd(__m512d t0,
                         __m512d t1,
                         __m512d t2,
@@ -355,7 +355,7 @@ avx512_permute_x4var_pd(__m512d t0,
     return _mm512_mask_blend_pd(lut_mask, res1, res2);
 }

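`avx512_permute_x4var_pd` emulates a variable lookup into a 32-entry table of doubles spread over four registers: the elided body builds `res1` from (t0, t1) and `res2` from (t2, t3) with two-table permutes, and `lut_mask`, derived from the high index bit, blends between the pairs. A scalar model of the addressing, assuming that layout:

    /* Scalar model of avx512_permute_x4var_pd: idx in [0, 31] selects
     * lane (idx & 7) of table t[idx >> 3]; the permutes consume the
     * low index bits and the final blend consumes the high one. */
    static double permute_x4var_scalar(const double t[4][8], unsigned idx)
    {
        return t[idx >> 3][idx & 7];
    }
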
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
                         __m512d t4, __m512d t5, __m512d t6, __m512d t7,
                         __m512i index)
@@ -401,7 +401,7 @@ avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
  * 3) x* = x - y*c3
  * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
  */
-static NPY_INLINE @vtype@
+NPY_FINLINE @vtype@
 simd_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
 {
     @vtype@ reduced_x = @fmadd@(y, c1, x);
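
The comment above the final hunk describes classic Cody-Waite range reduction: computing x - y*C in one step would lose the low bits of C to rounding, so the constant is split as C ≈ c1 + c2 + c3 and subtracted in three FMAs. Since the body uses `@fmadd@(y, c, x) = y*c + x`, the constants are evidently passed pre-negated. A scalar sketch with an illustrative fdlibm-style split of C = pi/2; these particular constants are an assumption, not taken from this file:

    #include <math.h>

    /* Scalar Cody-Waite reduction, mirroring simd_range_reduction.
     * c1, c2, c3 are a hypothetical negated 3-way split of pi/2;
     * each FMA cancels another chunk of y*C from x. */
    static double range_reduce(double x, double y)
    {
        const double c1 = -1.5707963267341256e+00; /* high bits of pi/2 */
        const double c2 = -6.0771005065061922e-11; /* middle bits       */
        const double c3 = -2.0222662487959506e-21; /* c3 ~ C - c1 - c2  */
        double r = fma(y, c1, x);
        r = fma(y, c2, r);
        r = fma(y, c3, r);
        return r;
    }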