@@ -178,8 +178,7 @@ _mm_maskz_cvtx2ps_ph(__mmask8 __U, __m128 __A, __m128 __B) {
178
178
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph (__m256 __A ,
179
179
__m256 __B ) {
180
180
return (__m256h )__builtin_ia32_vcvt2ps2phx256_mask (
181
- (__v8sf )__A , (__v8sf )__B , (__v16hf )_mm256_setzero_ph (), (__mmask16 )(-1 ),
182
- _MM_FROUND_CUR_DIRECTION );
181
+ (__v8sf )__A , (__v8sf )__B , (__v16hf )_mm256_setzero_ph (), (__mmask16 )(-1 ));
183
182
}
184
183
185
184
/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
@@ -223,8 +222,7 @@ static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph(__m256 __A,
223
222
static __inline__ __m256h __DEFAULT_FN_ATTRS256
224
223
_mm256_mask_cvtx2ps_ph (__m256h __W , __mmask16 __U , __m256 __A , __m256 __B ) {
225
224
return (__m256h )__builtin_ia32_vcvt2ps2phx256_mask (
226
- (__v8sf )__A , (__v8sf )__B , (__v16hf )__W , (__mmask16 )__U ,
227
- _MM_FROUND_CUR_DIRECTION );
225
+ (__v8sf )__A , (__v8sf )__B , (__v16hf )__W , (__mmask16 )__U );
228
226
}
229
227
230
228
/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
@@ -266,142 +264,9 @@ _mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
266
264
static __inline__ __m256h __DEFAULT_FN_ATTRS256
267
265
_mm256_maskz_cvtx2ps_ph (__mmask16 __U , __m256 __A , __m256 __B ) {
268
266
return (__m256h )__builtin_ia32_vcvt2ps2phx256_mask (
269
- (__v8sf )__A , (__v8sf )__B , (__v16hf )_mm256_setzero_ph (), (__mmask16 )__U ,
270
- _MM_FROUND_CUR_DIRECTION );
267
+ (__v8sf )__A , (__v8sf )__B , (__v16hf )_mm256_setzero_ph (), (__mmask16 )__U );
271
268
}
272
269
273
- /// Convert two 256-bit vectors, \a __A and \a __B, containing packed
274
- /// single-precision (32-bit) floating-point elements to a 256-bit vector
275
- /// containing FP16 elements. Rounding mode \a __R needs to be provided.
276
- ///
277
- /// \code{.operation}
278
- /// FOR i := 0 to 15
279
- /// IF i < 8
280
- /// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
281
- /// ELSE
282
- /// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
283
- /// FI
284
- /// ENDFOR
285
- ///
286
- /// dst[MAX:256] := 0
287
- /// \endcode
288
- ///
289
- /// \headerfile <immintrin.h>
290
- ///
291
- /// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
292
- ///
293
- /// \param __A
294
- /// A 256-bit vector of [8 x float].
295
- /// \param __B
296
- /// A 256-bit vector of [8 x float].
297
- /// \param __R
298
- /// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
299
- /// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
300
- /// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
301
- /// _MM_FROUND_TO_ZERO.
302
- /// \returns
303
- /// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
304
- /// (converted) elements from \a __B; higher order elements correspond to the
305
- /// (converted) elements from \a __A.
306
- #define _mm256_cvtx_round2ps_ph (__A , __B , __R ) \
307
- ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
308
- (__v8sf)(__A), (__v8sf)(__B), (__v16hf)_mm256_undefined_ph(), \
309
- (__mmask16)(-1), (const int)(__R)))
310
-
311
- /// Convert two 256-bit vectors, \a __A and \a __B, containing packed
312
- /// single-precision (32-bit) floating-point elements to a 256-bit vector
313
- /// containing FP16 elements. Merging mask \a __U is used to determine if given
314
- /// element should be taken from \a __W instead. Rounding mode \a __R needs to
315
- /// be provided.
316
- ///
317
- /// \code{.operation}
318
- /// FOR i := 0 to 15
319
- /// IF __U[i]
320
- /// IF i < 8
321
- /// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
322
- /// ELSE
323
- /// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
324
- /// FI
325
- /// ELSE
326
- /// dst.fp16[i] := __W.fp16[i]
327
- /// FI
328
- /// ENDFOR
329
- ///
330
- /// dst[MAX:256] := 0
331
- /// \endcode
332
- ///
333
- /// \headerfile <immintrin.h>
334
- ///
335
- /// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
336
- ///
337
- /// \param __W
338
- /// A 256-bit vector of [16 x fp16].
339
- /// \param __U
340
- /// A 16-bit merging mask.
341
- /// \param __A
342
- /// A 256-bit vector of [8 x float].
343
- /// \param __B
344
- /// A 256-bit vector of [8 x float].
345
- /// \param __R
346
- /// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
347
- /// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
348
- /// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
349
- /// _MM_FROUND_TO_ZERO.
350
- /// \returns
351
- /// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
352
- /// (converted) elements from \a __B; higher order elements correspond to the
353
- /// (converted) elements from \a __A. If corresponding mask bit is not set, then
354
- /// element from \a __W is taken instead.
355
- #define _mm256_mask_cvtx_round2ps_ph (__W , __U , __A , __B , __R ) \
356
- ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
357
- (__v8sf)(__A), (__v8sf)(__B), (__v16hf)(__W), (__mmask16)(__U), (const int)(__R)))
358
-
359
- /// Convert two 256-bit vectors, \a __A and \a __B, containing packed
360
- /// single-precision (32-bit) floating-point elements to a 256-bit vector
361
- /// containing FP16 elements. Zeroing mask \a __U is used to determine if given
362
- /// element should be zeroed instead. Rounding mode \a __R needs to be provided.
363
- ///
364
- /// \code{.operation}
365
- /// FOR i := 0 to 15
366
- /// IF __U[i]
367
- /// IF i < 8
368
- /// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
369
- /// ELSE
370
- /// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
371
- /// FI
372
- /// ELSE
373
- /// dst.fp16[i] := 0
374
- /// FI
375
- /// ENDFOR
376
- ///
377
- /// dst[MAX:256] := 0
378
- /// \endcode
379
- ///
380
- /// \headerfile <immintrin.h>
381
- ///
382
- /// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
383
- ///
384
- /// \param __U
385
- /// A 16-bit zeroing mask.
386
- /// \param __A
387
- /// A 256-bit vector of [8 x float].
388
- /// \param __B
389
- /// A 256-bit vector of [8 x float].
390
- /// \param __R
391
- /// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
392
- /// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
393
- /// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
394
- /// _MM_FROUND_TO_ZERO.
395
- /// \returns
396
- /// A 256-bit vector of [16 x fp16]. Lower elements correspond to the
397
- /// (converted) elements from \a __B; higher order elements correspond to the
398
- /// (converted) elements from \a __A. If corresponding mask bit is not set,
399
- /// then zero is taken instead.
400
- #define _mm256_maskz_cvtx_round2ps_ph (__U , __A , __B , __R ) \
401
- ((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
402
- (__v8sf)(__A), (__v8sf)(__B), (__v16hf)(_mm256_setzero_ph()), \
403
- (__mmask16)(__U), (const int)(__R)))
404
-
405
270
/// Convert 128-bit vector \a __B containing packed FP16 floating-point elements
406
271
/// to FP8 E5M2 numbers, using conversion biases stored in lower 8 bits of each
407
272
/// 16-bit integer stored in \a __B.
0 commit comments