|
15 | 15 | #define __AMXFP8INTRIN_H |
16 | 16 | #ifdef __x86_64__ |
17 | 17 |
|
18 | | - |
19 | | -/// Compute dot-product of brain-float8 (BF8) or hybrid-float8 (HF8) |
20 | | -/// floating-point pairs in tiles \a a and \a b, accumulating the |
21 | | -/// intermediate single-precision (32-bit) floating-point elements with |
22 | | -/// elements in \a dst, and store the 32-bit result back to tile \a dst. |
| 18 | +/// These instructions compute the dot product of brain-float8 (BF8) or |
| 19 | +/// hybrid-float8 (HF8) elements, accumulating into single precision (FP32). |
| 20 | +/// The input elements can be BF8 or HF8. These instructions have three tile |
| 21 | +/// operands: one source/dest accumulator operand and two source operands, |
| 22 | +/// \a a and \a b. The \a a and \a b operands can be BF8 or HF8 independently, |
| 23 | +/// and the source/dest operand, \a dst, is always FP32. |
23 | 24 | /// |
24 | 25 | /// \headerfile <immintrin.h> |
25 | 26 | /// |
26 | 27 | /// \code |
27 | 28 | /// void _tile_dpbf8ps (__tile dst, __tile a, __tile b) |
28 | 29 | /// \endcode |
29 | 30 | /// |
30 | | -/// This intrinsic corresponds to the \c TDPBF8PS instruction. |
| 31 | +/// This intrinsic corresponds to the \c TDPBF8PS instruction, which is the dot |
| 32 | +/// product of a BF8 value (\a a) by a BF8 value (\a b) accumulating into a |
| 33 | +/// Single Precision (FP32) source/dest (\a dst). |
31 | 34 | /// |
32 | 35 | /// \param dst |
33 | 36 | /// The destination tile. Max size is 1024 Bytes. |
34 | 37 | /// \param a |
35 | 38 | /// The 1st source tile. Max size is 1024 Bytes. |
36 | 39 | /// \param b |
37 | 40 | /// The 2nd source tile. Max size is 1024 Bytes. |
38 | | -#define _tile_dpbf8ps __builtin_ia32_tdpbf8ps |
| 41 | +#define _tile_dpbf8ps(dst, a, b) __builtin_ia32_tdpbf8ps((dst), (a), (b)) |
39 | 42 |
|
40 | 43 | /// \code |
41 | 44 | /// void _tile_dpbhf8ps (__tile dst, __tile a, __tile b) |
42 | 45 | /// \endcode |
43 | 46 | /// |
44 | | -/// This intrinsic corresponds to the \c TDPBHF8PS instruction. |
| 47 | +/// This intrinsic corresponds to the \c TDPBHF8PS instruction, which is the dot |
| 48 | +/// product of a BF8 value (\a a) by an HF8 value (\a b) accumulating into a |
| 49 | +/// Single Precision (FP32) source/dest (\a dst). |
45 | 50 | /// |
46 | 51 | /// \param dst |
47 | 52 | /// The destination tile. Max size is 1024 Bytes. |
48 | 53 | /// \param a |
49 | 54 | /// The 1st source tile. Max size is 1024 Bytes. |
50 | 55 | /// \param b |
51 | 56 | /// The 2nd source tile. Max size is 1024 Bytes. |
52 | | -#define _tile_dpbhf8ps __builtin_ia32_tdpbhf8ps |
| 57 | +#define _tile_dpbhf8ps(dst, a, b) __builtin_ia32_tdpbhf8ps((dst), (a), (b)) |
53 | 58 |
|
54 | 59 | /// \code |
55 | 60 | /// void _tile_dphbf8ps (__tile dst, __tile a, __tile b) |
56 | 61 | /// \endcode |
57 | 62 | /// |
58 | | -/// This intrinsic corresponds to the \c TDPHBF8PS instruction. |
| 63 | +/// This intrinsic corresponds to the \c TDPHBF8PS instruction, which is the dot |
| 64 | +/// product of an HF8 value (\a a) by a BF8 value (\a b) accumulating into a |
| 65 | +/// Single Precision (FP32) source/dest (\a dst). |
59 | 66 | /// |
60 | 67 | /// \param dst |
61 | 68 | /// The destination tile. Max size is 1024 Bytes. |
62 | 69 | /// \param a |
63 | 70 | /// The 1st source tile. Max size is 1024 Bytes. |
64 | 71 | /// \param b |
65 | 72 | /// The 2nd source tile. Max size is 1024 Bytes. |
66 | | -#define _tile_dphbf8ps __builtin_ia32_tdphbf8ps |
| 73 | +#define _tile_dphbf8ps(dst, a, b) __builtin_ia32_tdphbf8ps((dst), (a), (b)) |
67 | 74 |
|
68 | 75 | /// \code |
69 | 76 | /// void _tile_dphf8ps (__tile dst, __tile a, __tile b) |
70 | 77 | /// \endcode |
71 | 78 | /// |
72 | | -/// This intrinsic corresponds to the \c TDPHF8PS instruction. |
| 79 | +/// This intrinsic corresponds to the \c TDPHF8PS instruction, which is the dot |
| 80 | +/// product of an HF8 value (\a a) by an HF8 value (\a b) accumulating into a |
| 81 | +/// Single Precision (FP32) source/dest (\a dst). |
73 | 82 | /// |
74 | 83 | /// \param dst |
75 | 84 | /// The destination tile. Max size is 1024 Bytes. |
76 | 85 | /// \param a |
77 | 86 | /// The 1st source tile. Max size is 1024 Bytes. |
78 | 87 | /// \param b |
79 | 88 | /// The 2nd source tile. Max size is 1024 Bytes. |
80 | | -#define _tile_dphf8ps __builtin_ia32_tdphf8ps |
| 89 | +#define _tile_dphf8ps(dst, a, b) __builtin_ia32_tdphf8ps((dst), (a), (b)) |
81 | 90 |
|
82 | 91 | #endif /* __x86_64__ */ |
83 | 92 | #endif /* __AMXFP8INTRIN_H */ |
0 commit comments