@@ -95,10 +95,10 @@ _mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 /// 32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 /// 32 bits and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a scalar multiply-add of the double-precision values in the
@@ -124,10 +124,10 @@ _mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 /// 64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 /// 64 bits and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a multiply-subtract of 128-bit vectors of [4 x float].
@@ -195,10 +195,10 @@ _mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 /// 32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a scalar multiply-subtract of the double-precision values in
@@ -224,10 +224,10 @@ _mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 /// 64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], __B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a negated multiply-add of 128-bit vectors of [4 x float].
@@ -295,10 +295,10 @@ _mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C)
 /// 32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a scalar negated multiply-add of the double-precision values
@@ -324,10 +324,10 @@ _mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C)
 /// 64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], __C[0]);
+  return __A;
 }
 
 /// Computes a negated multiply-subtract of 128-bit vectors of [4 x float].
@@ -395,10 +395,10 @@ _mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C)
 /// 32 bits.
 /// \returns A 128-bit vector of [4 x float] containing the result in the low
 /// 32 bits, and a copy of \a __A[127:32] in the upper 96 bits.
-static __inline__ __m128 __DEFAULT_FN_ATTRS128
-_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
-{
-  return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C);
+static __inline__ __m128 __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a scalar negated multiply-subtract of the double-precision
@@ -424,10 +424,10 @@ _mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C)
 /// 64 bits.
 /// \returns A 128-bit vector of [2 x double] containing the result in the low
 /// 64 bits, and a copy of \a __A[127:64] in the upper 64 bits.
-static __inline__ __m128d __DEFAULT_FN_ATTRS128
-_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C)
-{
-  return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C);
+static __inline__ __m128d __DEFAULT_FN_ATTRS128_CONSTEXPR
+_mm_fnmsub_sd(__m128d __A, __m128d __B, __m128d __C) {
+  __A[0] = __builtin_elementwise_fma(__A[0], -__B[0], -__C[0]);
+  return __A;
 }
 
 /// Computes a multiply with alternating add/subtract of 128-bit vectors of
0 commit comments