|
37 | 37 |
|
38 | 38 | #endif |
39 | 39 |
|
| 40 | +#if !(defined(__AVXIFMA__) || defined(__AVX512IFMA__)) |
40 | 41 | #define _mm_madd52hi_epu64(X, Y, Z) \ |
41 | 42 | ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \ |
42 | 43 | (__v2di)(Z))) |
|
52 | 53 | #define _mm256_madd52lo_epu64(X, Y, Z) \ |
53 | 54 | ((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \ |
54 | 55 | (__v4di)(Z))) |
| 56 | +#endif |
| 57 | + |
| 58 | +#if defined(__AVX512IFMA__) |
| 59 | +static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
| 60 | +_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { |
| 61 | + return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y, |
| 62 | + (__v2di)__Z); |
| 63 | +} |
| 64 | + |
| 65 | +static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
| 66 | +_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) { |
| 67 | + return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y, |
| 68 | + (__v4di)__Z); |
| 69 | +} |
| 70 | + |
| 71 | +static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
| 72 | +_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) { |
| 73 | + return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y, |
| 74 | + (__v2di)__Z); |
| 75 | +} |
| 76 | + |
| 77 | +static __inline__ __m256i __DEFAULT_FN_ATTRS256 |
| 78 | +_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) { |
| 79 | + return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y, |
| 80 | + (__v4di)__Z); |
| 81 | +} |
| 82 | +#endif |
55 | 83 |
|
56 | 84 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
57 | 85 | _mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { |
58 | 86 | return (__m128i)__builtin_ia32_selectq_128( |
59 | | - __M, (__v2di)_mm_madd52hi_epu64(__W, __X, __Y), (__v2di)__W); |
| 87 | + __M, (__v2di)__builtin_ia32_vpmadd52huq128(__W, __X, __Y), (__v2di)__W); |
60 | 88 | } |
61 | 89 |
|
62 | 90 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
63 | 91 | _mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { |
64 | 92 | return (__m128i)__builtin_ia32_selectq_128( |
65 | | - __M, (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z), |
| 93 | + __M, (__v2di)__builtin_ia32_vpmadd52huq128(__X, __Y, __Z), |
66 | 94 | (__v2di)_mm_setzero_si128()); |
67 | 95 | } |
68 | 96 |
|
69 | 97 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52hi_epu64( |
70 | 98 | __m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { |
71 | 99 | return (__m256i)__builtin_ia32_selectq_256( |
72 | | - __M, (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y), (__v4di)__W); |
| 100 | + __M, (__v4di)__builtin_ia32_vpmadd52huq256(__W, __X, __Y), (__v4di)__W); |
73 | 101 | } |
74 | 102 |
|
75 | 103 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52hi_epu64( |
76 | 104 | __mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) { |
77 | 105 | return (__m256i)__builtin_ia32_selectq_256( |
78 | | - __M, (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z), |
| 106 | + __M, (__v4di)__builtin_ia32_vpmadd52huq256(__X, __Y, __Z), |
79 | 107 | (__v4di)_mm256_setzero_si256()); |
80 | 108 | } |
81 | 109 |
|
82 | 110 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
83 | 111 | _mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { |
84 | 112 | return (__m128i)__builtin_ia32_selectq_128( |
85 | | - __M, (__v2di)_mm_madd52lo_epu64(__W, __X, __Y), (__v2di)__W); |
| 113 | + __M, (__v2di)__builtin_ia32_vpmadd52luq128(__W, __X, __Y), (__v2di)__W); |
86 | 114 | } |
87 | 115 |
|
88 | 116 | static __inline__ __m128i __DEFAULT_FN_ATTRS128 |
89 | 117 | _mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { |
90 | 118 | return (__m128i)__builtin_ia32_selectq_128( |
91 | | - __M, (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z), |
| 119 | + __M, (__v2di)__builtin_ia32_vpmadd52luq128(__X, __Y, __Z), |
92 | 120 | (__v2di)_mm_setzero_si128()); |
93 | 121 | } |
94 | 122 |
|
95 | 123 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52lo_epu64( |
96 | 124 | __m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { |
97 | 125 | return (__m256i)__builtin_ia32_selectq_256( |
98 | | - __M, (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y), (__v4di)__W); |
| 126 | + __M, (__v4di)__builtin_ia32_vpmadd52luq256(__W, __X, __Y), (__v4di)__W); |
99 | 127 | } |
100 | 128 |
|
101 | 129 | static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52lo_epu64( |
102 | 130 | __mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) { |
103 | 131 | return (__m256i)__builtin_ia32_selectq_256( |
104 | | - __M, (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z), |
| 132 | + __M, (__v4di)__builtin_ia32_vpmadd52luq256(__X, __Y, __Z), |
105 | 133 | (__v4di)_mm256_setzero_si256()); |
106 | 134 | } |
107 | 135 |
|
|
0 commit comments