|
190 | 190 | # else |
191 | 191 | # define OIIO_AVX512PF_ENABLED 0 |
192 | 192 | # endif |
193 | | -# if defined(__AVX512ER__) |
194 | | -# define OIIO_AVX512ER_ENABLED 1 /* Exponential & reciprocal */ |
195 | | -# else |
196 | | -# define OIIO_AVX512ER_ENABLED 0 |
197 | | -# endif |
198 | 193 | # if defined(__AVX512CD__) |
199 | 194 | # define OIIO_AVX512CD_ENABLED 1 /* Conflict detection */ |
200 | 195 | # else |
|
215 | 210 | # define OIIO_AVX512VL_ENABLED 0 |
216 | 211 | # define OIIO_AVX512DQ_ENABLED 0 |
217 | 212 | # define OIIO_AVX512PF_ENABLED 0 |
218 | | -# define OIIO_AVX512ER_ENABLED 0 |
219 | 213 | # define OIIO_AVX512CD_ENABLED 0 |
220 | 214 | # define OIIO_AVX512BW_ENABLED 0 |
221 | 215 | #endif |
@@ -6903,7 +6897,7 @@ OIIO_FORCEINLINE void vfloat4::store (float *values, int n) const { |
6903 | 6897 | #if defined(_HALF_H_) || defined(IMATH_HALF_H_) |
6904 | 6898 | OIIO_FORCEINLINE void vfloat4::store (half *values) const { |
6905 | 6899 | #if OIIO_F16C_ENABLED && OIIO_SIMD_SSE |
6906 | | - __m128i h = _mm_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); |
| 6900 | + __m128i h = _mm_cvtps_ph (m_simd, _MM_FROUND_TO_NEAREST_INT); |
6907 | 6901 | _mm_store_sd ((double *)values, _mm_castsi128_pd(h)); |
6908 | 6902 | #elif OIIO_SIMD_NEON |
6909 | 6903 | float16x4_t f16 = vcvt_f16_f32(m_simd); |
@@ -7627,10 +7621,7 @@ OIIO_FORCEINLINE vfloat4 rsqrt (const vfloat4 &a) |
7627 | 7621 |
|
7628 | 7622 | OIIO_FORCEINLINE vfloat4 rsqrt_fast (const vfloat4 &a) |
7629 | 7623 | { |
7630 | | -#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED |
7631 | | - // Trickery: in and out of the 512 bit registers to use fast approx rsqrt |
7632 | | - return _mm512_castps512_ps128(_mm512_rsqrt28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC)); |
7633 | | -#elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED |
| 7624 | +#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED |
7634 | 7625 | // Trickery: in and out of the 512 bit registers to use fast approx rsqrt |
7635 | 7626 | return _mm512_castps512_ps128(_mm512_rsqrt14_ps(_mm512_castps128_ps512(a))); |
7636 | 7627 | #elif OIIO_SIMD_SSE |
@@ -8794,7 +8785,7 @@ OIIO_FORCEINLINE void vfloat8::store (float *values, int n) const { |
8794 | 8785 | #if defined(_HALF_H_) || defined(IMATH_HALF_H_) |
8795 | 8786 | OIIO_FORCEINLINE void vfloat8::store (half *values) const { |
8796 | 8787 | #if OIIO_SIMD_AVX && OIIO_F16C_ENABLED |
8797 | | - __m128i h = _mm256_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); |
| 8788 | + __m128i h = _mm256_cvtps_ph (m_simd, _MM_FROUND_TO_NEAREST_INT); |
8798 | 8789 | _mm_storeu_si128 ((__m128i *)values, h); |
8799 | 8790 | #elif OIIO_SIMD_SSE || OIIO_SIMD_NEON |
8800 | 8791 | m_4[0].store(values); |
@@ -9285,10 +9276,7 @@ OIIO_FORCEINLINE vfloat8 rsqrt (const vfloat8 &a) |
9285 | 9276 |
|
9286 | 9277 | OIIO_FORCEINLINE vfloat8 rsqrt_fast (const vfloat8 &a) |
9287 | 9278 | { |
9288 | | -#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED |
9289 | | - // Trickery: in and out of the 512 bit registers to use fast approx rsqrt |
9290 | | - return _mm512_castps512_ps256(_mm512_rsqrt28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC)); |
9291 | | -#elif OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED |
| 9279 | +#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512VL_ENABLED |
9292 | 9280 | // Trickery: in and out of the 512 bit registers to use fast approx rsqrt |
9293 | 9281 | return _mm512_castps512_ps256(_mm512_rsqrt14_ps(_mm512_castps256_ps512(a))); |
9294 | 9282 | #elif OIIO_SIMD_AVX |
@@ -9673,7 +9661,7 @@ OIIO_FORCEINLINE void vfloat16::store (float *values, int n) const { |
9673 | 9661 | #if defined(_HALF_H_) || defined(IMATH_HALF_H_) |
9674 | 9662 | OIIO_FORCEINLINE void vfloat16::store (half *values) const { |
9675 | 9663 | #if OIIO_SIMD_AVX >= 512 |
9676 | | - __m256i h = _mm512_cvtps_ph (m_simd, (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); |
| 9664 | + __m256i h = _mm512_cvtps_ph (m_simd, _MM_FROUND_TO_NEAREST_INT); |
9677 | 9665 | _mm256_storeu_si256 ((__m256i *)values, h); |
9678 | 9666 | #else |
9679 | 9667 | m_8[0].store (values); |
@@ -10113,9 +10101,7 @@ OIIO_FORCEINLINE vint16 rint (const vfloat16& a) |
10113 | 10101 |
|
10114 | 10102 | OIIO_FORCEINLINE vfloat16 rcp_fast (const vfloat16 &a) |
10115 | 10103 | { |
10116 | | -#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED |
10117 | | - return _mm512_rcp28_ps(a); |
10118 | | -#elif OIIO_SIMD_AVX >= 512 |
| 10104 | +#if OIIO_SIMD_AVX >= 512 |
10119 | 10105 | vfloat16 r = _mm512_rcp14_ps(a); |
10120 | 10106 | return r * nmadd (r, a, vfloat16(2.0f)); |
10121 | 10107 | #else |
@@ -10146,9 +10132,7 @@ OIIO_FORCEINLINE vfloat16 rsqrt (const vfloat16 &a) |
10146 | 10132 |
|
10147 | 10133 | OIIO_FORCEINLINE vfloat16 rsqrt_fast (const vfloat16 &a) |
10148 | 10134 | { |
10149 | | -#if OIIO_SIMD_AVX >= 512 && OIIO_AVX512ER_ENABLED |
10150 | | - return _mm512_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC); |
10151 | | -#elif OIIO_SIMD_AVX >= 512 |
| 10135 | +#if OIIO_SIMD_AVX >= 512 |
10152 | 10136 | return _mm512_rsqrt14_ps (a); |
10153 | 10137 | #else |
10154 | 10138 | return vfloat16(rsqrt_fast(a.lo()), rsqrt_fast(a.hi())); |
|
0 commit comments