
 #ifdef SIMD_AVX2_FMA3

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_get_full_load_mask_ps(void)
 {
     return _mm256_set1_ps(-1.0);
 }

-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
 fma_get_full_load_mask_pd(void)
 {
     return _mm256_castpd_si256(_mm256_set1_pd(-1.0));
 }

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
 {
     float maskint[16] = {-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
@@ -66,15 +66,15 @@ fma_get_partial_load_mask_ps(const npy_int num_elem, const npy_int num_lanes)
     return _mm256_loadu_ps(addr);
 }

-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
 fma_get_partial_load_mask_pd(const npy_int num_elem, const npy_int num_lanes)
 {
     npy_int maskint[16] = {-1,-1,-1,-1,-1,-1,-1,-1,1,1,1,1,1,1,1,1};
     npy_int* addr = maskint + 2*num_lanes - 2*num_elem;
     return _mm256_loadu_si256((__m256i*) addr);
 }

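The two partial-mask helpers above build a lane mask by sliding a load window over a constant array: the first half holds -1 values (sign bit set, lane active) and the second half holds 1 (sign bit clear, lane inactive), so offsetting the base pointer by `num_lanes - num_elem` elements (doubled for the 64-bit variant, which needs two 32-bit ints per lane) leaves exactly the first `num_elem` lanes active. A minimal scalar sketch of the same idea, with a hypothetical helper name; note that only the sign bit of -1.0f matters, since that is all the AVX2 mask operations examine:

    /* Sketch of fma_get_partial_load_mask_ps's sliding-window trick for
     * 8 lanes: the first num_elem outputs come from the -1.0f half of
     * the window (sign bit set = active), the rest from the 1.0f half. */
    static void partial_mask_f32(float out[8], int num_elem)
    {
        static const float window[16] = {
            -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f, -1.0f,
             1.0f,  1.0f,  1.0f,  1.0f,  1.0f,  1.0f,  1.0f,  1.0f,
        };
        const float *addr = window + 8 - num_elem;  /* num_lanes - num_elem */
        for (int i = 0; i < 8; i++) {
            out[i] = addr[i];
        }
    }
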
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_masked_gather_ps(__m256 src,
                      npy_float* addr,
                      __m256i vindex,
@@ -83,7 +83,7 @@ fma_masked_gather_ps(__m256 src,
     return _mm256_mask_i32gather_ps(src, addr, vindex, mask, 4);
 }

-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
 fma_masked_gather_pd(__m256d src,
                      npy_double* addr,
                      __m128i vindex,
@@ -92,49 +92,49 @@ fma_masked_gather_pd(__m256d src,
     return _mm256_mask_i32gather_pd(src, addr, vindex, mask, 8);
 }

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_masked_load_ps(__m256 mask, npy_float* addr)
 {
     return _mm256_maskload_ps(addr, _mm256_cvtps_epi32(mask));
 }

-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
 fma_masked_load_pd(__m256i mask, npy_double* addr)
 {
     return _mm256_maskload_pd(addr, mask);
 }

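These load helpers let a vector loop handle strided access (the gathers) and a ragged tail (the masked loads) without reading out of bounds. A hedged sketch of the calling pattern, assuming a contiguous unary loop like the ones later in this file; the loop shape and the masked store are illustrative, not part of this patch:

    #include <immintrin.h>

    /* Sketch: process n contiguous floats 8 at a time; the final
     * iteration uses a partial mask so the load stays in bounds. */
    static void unary_loop_sketch(npy_float *dst, npy_float *src, npy_int n)
    {
        const npy_int lanes = 8;
        for (npy_int i = 0; i < n; i += lanes) {
            __m256 mask = (n - i >= lanes) ?
                fma_get_full_load_mask_ps() :
                fma_get_partial_load_mask_ps(n - i, lanes);
            __m256 x = fma_masked_load_ps(mask, src + i);
            /* ... transcendental kernel on x ... */
            _mm256_maskstore_ps(dst + i, _mm256_cvtps_epi32(mask), x);
        }
    }
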
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_set_masked_lanes_ps(__m256 x, __m256 val, __m256 mask)
 {
     return _mm256_blendv_ps(x, val, mask);
 }

-static NPY_INLINE __m256d
+NPY_FINLINE __m256d
 fma_set_masked_lanes_pd(__m256d x, __m256d val, __m256d mask)
 {
     return _mm256_blendv_pd(x, val, mask);
 }

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_blend(__m256 x, __m256 y, __m256 ymask)
 {
     return _mm256_blendv_ps(x, y, ymask);
 }

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_invert_mask_ps(__m256 ymask)
 {
     return _mm256_andnot_ps(ymask, _mm256_set1_ps(-1.0));
 }

-static NPY_INLINE __m256i
+NPY_FINLINE __m256i
 fma_invert_mask_pd(__m256i ymask)
 {
     return _mm256_andnot_si256(ymask, _mm256_set1_epi32(0xFFFFFFFF));
 }

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_get_exponent(__m256 x)
 {
     /*
@@ -165,7 +165,7 @@ fma_get_exponent(__m256 x)
     return _mm256_blendv_ps(exp, denorm_exp, denormal_mask);
 }

-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_get_mantissa(__m256 x)
 {
     /*
@@ -195,7 +195,7 @@ fma_get_mantissa(__m256 x)
                         _mm256_castps_si256(x), mantissa_bits), exp_126_bits));
 }

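Taken together, `fma_get_exponent` and `fma_get_mantissa` decompose x as mantissa * 2^exponent with the mantissa normalized to [0.5, 1), the same contract as C's frexp (their comment bodies, elided by the hunks above, describe the bit-level extraction; denormals take the separate blend path). A scalar reference for that contract, assuming normal inputs:

    #include <math.h>

    /* Scalar model of the fma_get_exponent/fma_get_mantissa pair:
     * x == mant * 2^exp with mant in [0.5, 1).
     * Example: x = 8.0f yields mant = 0.5, exp = 4. */
    static void get_exp_mant_scalar(float x, float *exp, float *mant)
    {
        int e;
        *mant = frexpf(x, &e);
        *exp = (float)e;
    }

The AVX512 variants further down implement the same contract directly with the vgetexp/vgetmant instructions.
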
-static NPY_INLINE __m256
+NPY_FINLINE __m256
 fma_scalef_ps(__m256 poly, __m256 quadrant)
 {
     /*
@@ -238,31 +238,31 @@ fma_scalef_ps(__m256 poly, __m256 quadrant)

 #ifdef SIMD_AVX512F

-static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
+NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __mmask16
 avx512_get_full_load_mask_ps(void)
 {
     return 0xFFFF;
 }

-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
 avx512_get_full_load_mask_pd(void)
 {
     return 0xFF;
 }

-static NPY_INLINE __mmask16
+NPY_FINLINE __mmask16
 avx512_get_partial_load_mask_ps(const npy_int num_elem, const npy_int total_elem)
 {
     return (0x0001 << num_elem) - 0x0001;
 }

-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
 avx512_get_partial_load_mask_pd(const npy_int num_elem, const npy_int total_elem)
 {
     return (0x01 << num_elem) - 0x01;
 }

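Unlike the AVX2 path, AVX512 has dedicated k-mask registers, so a partial mask is a pure bit trick: `(1 << num_elem) - 1` sets the low `num_elem` bits. A one-line scalar check of the arithmetic:

    /* (1 << n) - 1 sets the low n bits: n = 3 -> 0b1000 - 1 = 0b0111,
     * i.e. lanes 0..2 enabled, exactly what the helpers above return. */
    unsigned low_bits(int n) { return (1u << n) - 1u; }
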
-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_masked_gather_ps(__m512 src,
                         npy_float* addr,
                         __m512i vindex,
@@ -271,7 +271,7 @@ avx512_masked_gather_ps(__m512 src,
     return _mm512_mask_i32gather_ps(src, kmask, vindex, addr, 4);
 }

-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_masked_gather_pd(__m512d src,
                         npy_double* addr,
                         __m256i vindex,
@@ -280,67 +280,67 @@ avx512_masked_gather_pd(__m512d src,
     return _mm512_mask_i32gather_pd(src, kmask, vindex, addr, 8);
 }

-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_masked_load_ps(__mmask16 mask, npy_float* addr)
 {
     return _mm512_maskz_loadu_ps(mask, (__m512 *)addr);
 }

-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_masked_load_pd(__mmask8 mask, npy_double* addr)
 {
     return _mm512_maskz_loadu_pd(mask, (__m512d *)addr);
 }

-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_set_masked_lanes_ps(__m512 x, __m512 val, __mmask16 mask)
 {
     return _mm512_mask_blend_ps(mask, x, val);
 }

-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_set_masked_lanes_pd(__m512d x, __m512d val, __mmask8 mask)
 {
     return _mm512_mask_blend_pd(mask, x, val);
 }

-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_blend(__m512 x, __m512 y, __mmask16 ymask)
 {
     return _mm512_mask_mov_ps(x, ymask, y);
 }

-static NPY_INLINE __mmask16
+NPY_FINLINE __mmask16
 avx512_invert_mask_ps(__mmask16 ymask)
 {
     return _mm512_knot(ymask);
 }

-static NPY_INLINE __mmask8
+NPY_FINLINE __mmask8
 avx512_invert_mask_pd(__mmask8 ymask)
 {
     return _mm512_knot(ymask);
 }

-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_get_exponent(__m512 x)
 {
     return _mm512_add_ps(_mm512_getexp_ps(x), _mm512_set1_ps(1.0f));
 }

-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_get_mantissa(__m512 x)
 {
     return _mm512_getmant_ps(x, _MM_MANT_NORM_p5_1, _MM_MANT_SIGN_src);
 }

-static NPY_INLINE __m512
+NPY_FINLINE __m512
 avx512_scalef_ps(__m512 poly, __m512 quadrant)
 {
     return _mm512_scalef_ps(poly, quadrant);
 }

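The last three helpers are the hardware form of the frexp-style split noted earlier: `getexp` returns floor(log2|x|) (hence the +1 to match the [0.5, 1) mantissa convention), `getmant` with `_MM_MANT_NORM_p5_1` normalizes to [0.5, 1) keeping the source sign, and `scalef(p, q) = p * 2^q` undoes the split. A scalar identity check for normal inputs, using only the documented intrinsic semantics:

    #include <math.h>

    /* Scalar model: mant * 2^exp reconstructs x when
     * exp = floor(log2|x|) + 1 and mant = x / 2^exp.
     * Example: x = 8 -> exp = 4, mant = 0.5, 0.5 * 2^4 = 8. */
    static float reconstruct(float x)
    {
        float e = floorf(log2f(fabsf(x))) + 1.0f; /* avx512_get_exponent */
        float m = x / exp2f(e);                   /* avx512_get_mantissa */
        return m * exp2f(e);                      /* avx512_scalef_ps    */
    }
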
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_permute_x4var_pd(__m512d t0,
                         __m512d t1,
                         __m512d t2,
@@ -355,7 +355,7 @@ avx512_permute_x4var_pd(__m512d t0,
     return _mm512_mask_blend_pd(lut_mask, res1, res2);
 }

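`avx512_permute_x4var_pd` emulates a variable lookup into a 32-entry table of doubles spread over four registers: the elided body builds `res1` from (t0, t1) and `res2` from (t2, t3) with two-table permutes, and `lut_mask`, derived from the high index bit, blends between the pairs. A scalar model of the addressing, assuming that layout:

    /* Scalar model of avx512_permute_x4var_pd: idx in [0, 31] selects
     * lane (idx & 7) of table t[idx >> 3]; the permutes consume the
     * low index bits and the final blend consumes the high one. */
    static double permute_x4var_scalar(const double t[4][8], unsigned idx)
    {
        return t[idx >> 3][idx & 7];
    }
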
-static NPY_INLINE __m512d
+NPY_FINLINE __m512d
 avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
                         __m512d t4, __m512d t5, __m512d t6, __m512d t7,
                         __m512i index)
@@ -401,7 +401,7 @@ avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
  * 3) x* = x - y*c3
  * c1, c2 are exact floating points, c3 = C - c1 - c2 simulates higher precision
  */
-static NPY_INLINE @vtype@
+NPY_FINLINE @vtype@
 simd_range_reduction(@vtype@ x, @vtype@ y, @vtype@ c1, @vtype@ c2, @vtype@ c3)
 {
     @vtype@ reduced_x = @fmadd@(y, c1, x);
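
The comment above the final hunk describes classic Cody-Waite range reduction: computing x - y*C in one step would lose the low bits of C to rounding, so the constant is split as C ≈ c1 + c2 + c3 and subtracted in three FMAs. Since the body uses `@fmadd@(y, c, x) = y*c + x`, the constants are evidently passed pre-negated. A scalar sketch with an illustrative fdlibm-style split of C = pi/2; these particular constants are an assumption, not taken from this file:

    #include <math.h>

    /* Scalar Cody-Waite reduction, mirroring simd_range_reduction.
     * c1, c2, c3 are a hypothetical negated 3-way split of pi/2;
     * each FMA cancels another chunk of y*C from x. */
    static double range_reduce(double x, double y)
    {
        const double c1 = -1.5707963267341256e+00; /* high bits of pi/2 */
        const double c2 = -6.0771005065061922e-11; /* middle bits       */
        const double c3 = -2.0222662487959506e-21; /* c3 ~ C - c1 - c2  */
        double r = fma(y, c1, x);
        r = fma(y, c2, r);
        r = fma(y, c3, r);
        return r;
    }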