@@ -178,8 +178,7 @@ _mm_maskz_cvtx2ps_ph(__mmask8 __U, __m128 __A, __m128 __B) {
178178static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph (__m256 __A ,
179179 __m256 __B ) {
180180 return (__m256h )__builtin_ia32_vcvt2ps2phx256_mask (
181- (__v8sf )__A , (__v8sf )__B , (__v16hf )_mm256_setzero_ph (), (__mmask16 )(-1 ),
182- _MM_FROUND_CUR_DIRECTION );
181+ (__v8sf )__A , (__v8sf )__B , (__v16hf )_mm256_setzero_ph (), (__mmask16 )(-1 ));
183182}
184183
185184/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
@@ -223,8 +222,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph(__m256 __A,
223222static __inline__ __m256h __DEFAULT_FN_ATTRS256
224223_mm256_mask_cvtx2ps_ph (__m256h __W , __mmask16 __U , __m256 __A , __m256 __B ) {
225224 return (__m256h )__builtin_ia32_vcvt2ps2phx256_mask (
226- (__v8sf )__A , (__v8sf )__B , (__v16hf )__W , (__mmask16 )__U ,
227- _MM_FROUND_CUR_DIRECTION );
225+ (__v8sf )__A , (__v8sf )__B , (__v16hf )__W , (__mmask16 )__U );
228226}
229227
230228/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
@@ -266,142 +264,9 @@ _mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
266264static __inline__ __m256h __DEFAULT_FN_ATTRS256
267265_mm256_maskz_cvtx2ps_ph (__mmask16 __U , __m256 __A , __m256 __B ) {
268266 return (__m256h )__builtin_ia32_vcvt2ps2phx256_mask (
269- (__v8sf )__A , (__v8sf )__B , (__v16hf )_mm256_setzero_ph (), (__mmask16 )__U ,
270- _MM_FROUND_CUR_DIRECTION );
267+ (__v8sf )__A , (__v8sf )__B , (__v16hf )_mm256_setzero_ph (), (__mmask16 )__U );
271268}
272269
273- /// Convert two 256-bit vectors, \a __A and \a __B, containing packed
274- /// single-precision (32-bit) floating-point elements to a 256-bit vector
275- /// containing FP16 elements. Rounding mode \a __R needs to be provided.
276- ///
277- /// \code{.operation}
278- /// FOR i := 0 to 15
279- /// IF i < 8
280- /// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
281- /// ELSE
282- /// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
283- /// FI
284- /// ENDFOR
285- ///
286- /// dst[MAX:256] := 0
287- /// \endcode
288- ///
289- /// \headerfile <immintrin.h>
290- ///
291- /// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
292- ///
293- /// \param __A
294- /// A 256-bit vector of [8 x float].
295- /// \param __B
296- /// A 256-bit vector of [8 x float].
297- /// \param __R
298- /// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
299- /// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
300- /// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
301- /// _MM_FROUND_TO_ZERO.
302- /// \returns
303- /// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
304- /// (converted) elements from \a __B; higher order elements correspond to the
305- /// (converted) elements from \a __A.
306- #define _mm256_cvtx_round2ps_ph (__A , __B , __R ) \
307- ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
308- (__v8sf)(__A), (__v8sf)(__B), (__v16hf)_mm256_undefined_ph(), \
309- (__mmask16)(-1), (const int)(__R)))
310-
311- /// Convert two 256-bit vectors, \a __A and \a __B, containing packed
312- /// single-precision (32-bit) floating-point elements to a 256-bit vector
313- /// containing FP16 elements. Merging mask \a __U is used to determine if given
314- /// element should be taken from \a __W instead. Rounding mode \a __R needs to
315- /// be provided.
316- ///
317- /// \code{.operation}
318- /// FOR i := 0 to 15
319- /// IF __U[i]
320- /// IF i < 8
321- /// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
322- /// ELSE
323- /// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
324- /// FI
325- /// ELSE
326- /// dst.fp16[i] := __W.fp16[i]
327- /// FI
328- /// ENDFOR
329- ///
330- /// dst[MAX:256] := 0
331- /// \endcode
332- ///
333- /// \headerfile <immintrin.h>
334- ///
335- /// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
336- ///
337- /// \param __W
338- /// A 256-bit vector of [16 x fp16].
339- /// \param __U
340- /// A 16-bit merging mask.
341- /// \param __A
342- /// A 256-bit vector of [8 x float].
343- /// \param __B
344- /// A 256-bit vector of [8 x float].
345- /// \param __R
346- /// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
347- /// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
348- /// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
349- /// _MM_FROUND_TO_ZERO.
350- /// \returns
351- /// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
352- /// (converted) elements from \a __B; higher order elements correspond to the
353- /// (converted) elements from \a __A. If corresponding mask bit is not set, then
354- /// element from \a __W is taken instead.
355- #define _mm256_mask_cvtx_round2ps_ph (__W , __U , __A , __B , __R ) \
356- ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
357- (__v8sf)(__A), (__v8sf)(__B), (__v16hf)(__W), (__mmask16)(__U), (const int)(__R)))
358-
359- /// Convert two 256-bit vectors, \a __A and \a __B, containing packed
360- /// single-precision (32-bit) floating-point elements to a 256-bit vector
361- /// containing FP16 elements. Zeroing mask \a __U is used to determine if given
362- /// element should be zeroed instead. Rounding mode \a __R needs to be provided.
363- ///
364- /// \code{.operation}
365- /// FOR i := 0 to 15
366- /// IF __U[i]
367- /// IF i < 8
368- /// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
369- /// ELSE
370- /// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
371- /// FI
372- /// ELSE
373- /// dst.fp16[i] := 0
374- /// FI
375- /// ENDFOR
376- ///
377- /// dst[MAX:256] := 0
378- /// \endcode
379- ///
380- /// \headerfile <immintrin.h>
381- ///
382- /// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
383- ///
384- /// \param __U
385- /// A 16-bit zeroing mask.
386- /// \param __A
387- /// A 256-bit vector of [8 x float].
388- /// \param __B
389- /// A 256-bit vector of [8 x float].
390- /// \param __R
391- /// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
392- /// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
393- /// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
394- /// _MM_FROUND_TO_ZERO.
395- /// \returns
396- /// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
397- /// (converted) elements from \a __B; higher order elements correspond to the
398- /// (converted) elements from \a __A. If corresponding mask bit is not set,
399- /// then zero is taken instead.
400- #define _mm256_maskz_cvtx_round2ps_ph (__U , __A , __B , __R ) \
401- ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
402- (__v8sf)(__A), (__v8sf)(__B), (__v16hf)(_mm256_setzero_ph()), \
403- (__mmask16)(__U), (const int)(__R)))
404-
405270/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
406271/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
407272/// 16-bit integer stored in \a __B.
0 commit comments