diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index a4acc72fdc37d..a5aa578940483 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -93,13 +93,11 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { } let Features = "sse2" in { - def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def pavgb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; def pavgw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def packsswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">; def packssdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">; def packuswb128 : X86Builtin<"_Vector<16, char>(_Vector<8, short>, _Vector<8, short>)">; - def pmulhuw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def vec_ext_v2di : X86Builtin<"long long int(_Vector<2, long long int>, _Constant int)">; def vec_ext_v4si : X86Builtin<"int(_Vector<4, int>, _Constant int)">; def vec_ext_v4sf : X86Builtin<"float(_Vector<4, float>, _Constant int)">; @@ -107,6 +105,11 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { def vec_set_v8hi : X86Builtin<"_Vector<8, short>(_Vector<8, short>, short, _Constant int)">; } + let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def pmulhw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; + def pmulhuw128 : X86Builtin<"_Vector<8, unsigned short>(_Vector<8, unsigned short>, _Vector<8, unsigned short>)">; + } + let Features = "sse3" in { foreach Op = ["addsub", "hadd", "hsub"] in { def Op#ps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>)">; @@ -579,8 +582,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">; def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">; def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; - def pmulhuw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; - def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">; def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; @@ -619,6 +620,11 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def insert128i256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">; } +let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def pmulhuw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">; + def pmulhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; +} + let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in { def maskloadd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int const *>, _Vector<8, int>)">; def maskloadq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int const *>, _Vector<4, long long int>)">; @@ -1429,7 +1435,10 @@ let Features = "avx512bitalg,evex512", Attributes = [NoThrow, Const, RequiredVec let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { def pmulhrsw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; - def pmulhuw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; +} + +let Features = "avx512bw,evex512", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { + def pmulhuw512 : X86Builtin<"_Vector<32, unsigned short>(_Vector<32, unsigned short>, _Vector<32, unsigned short>)">; def pmulhw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<32, short>)">; } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 3679327da7b0c..d9c6632c5c483 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -11628,7 +11628,13 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } case Builtin::BI__builtin_elementwise_add_sat: - case Builtin::BI__builtin_elementwise_sub_sat: { + case Builtin::BI__builtin_elementwise_sub_sat: + case clang::X86::BI__builtin_ia32_pmulhuw128: + case clang::X86::BI__builtin_ia32_pmulhuw256: + case clang::X86::BI__builtin_ia32_pmulhuw512: + case clang::X86::BI__builtin_ia32_pmulhw128: + case clang::X86::BI__builtin_ia32_pmulhw256: + case clang::X86::BI__builtin_ia32_pmulhw512: { APValue SourceLHS, SourceRHS; if (!EvaluateAsRValue(Info, E->getArg(0), SourceLHS) || !EvaluateAsRValue(Info, E->getArg(1), SourceRHS)) @@ -11653,6 +11659,18 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { APSInt(LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS), DestEltTy->isUnsignedIntegerOrEnumerationType()))); break; + case clang::X86::BI__builtin_ia32_pmulhuw128: + case clang::X86::BI__builtin_ia32_pmulhuw256: + case clang::X86::BI__builtin_ia32_pmulhuw512: + ResultElements.push_back(APValue(APSInt(llvm::APIntOps::mulhu(LHS, RHS), + /*isUnsigned=*/true))); + break; + case clang::X86::BI__builtin_ia32_pmulhw128: + case clang::X86::BI__builtin_ia32_pmulhw256: + case clang::X86::BI__builtin_ia32_pmulhw512: + ResultElements.push_back(APValue(APSInt(llvm::APIntOps::mulhs(LHS, RHS), + /*isUnsigned=*/false))); + break; } } diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index 3c3a3d1453934..55e7102a7d192 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -1729,10 +1729,10 @@ _mm256_mulhrs_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the products. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epu16(__m256i __a, __m256i __b) { - return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b); + return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b); } /// Multiplies signed 16-bit integer elements of two 256-bit vectors of @@ -1748,7 +1748,7 @@ _mm256_mulhi_epu16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the products. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mulhi_epi16(__m256i __a, __m256i __b) { return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b); @@ -1767,7 +1767,7 @@ _mm256_mulhi_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16] containing one of the source operands. /// \returns A 256-bit vector of [16 x i16] containing the products. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_mullo_epi16(__m256i __a, __m256i __b) { return (__m256i)((__v16hu)__a * (__v16hu)__b); diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index c854720de6a65..233d4a6574f30 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -25,6 +25,14 @@ typedef unsigned long long __mmask64; __attribute__((__always_inline__, __nodebug__, \ __target__("avx512bw,no-evex512"))) +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 constexpr +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS512_CONSTEXPR __DEFAULT_FN_ATTRS512 +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + static __inline __mmask32 __DEFAULT_FN_ATTRS _knot_mask32(__mmask32 __M) { @@ -438,7 +446,7 @@ _mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) { (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mullo_epi16 (__m512i __A, __m512i __B) { return (__m512i) ((__v32hu) __A * (__v32hu) __B); } @@ -1082,7 +1090,7 @@ _mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mulhi_epi16(__m512i __A, __m512i __B) { return (__m512i)__builtin_ia32_pmulhw512((__v32hi) __A, (__v32hi) __B); @@ -1105,10 +1113,10 @@ _mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) (__v32hi)_mm512_setzero_si512()); } -static __inline__ __m512i __DEFAULT_FN_ATTRS512 +static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_mulhi_epu16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmulhuw512((__v32hi) __A, (__v32hi) __B); + return (__m512i)__builtin_ia32_pmulhuw512((__v32hu) __A, (__v32hu) __B); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -2010,5 +2018,7 @@ _mm512_sad_epu8 (__m512i __A, __m512i __B) #undef __DEFAULT_FN_ATTRS512 #undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS512_CONSTEXPR +#undef __DEFAULT_FN_ATTRS_CONSTEXPR #endif diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index 770bb5c4d0b99..60d2000dfb809 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -2394,8 +2394,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a, /// A 128-bit signed [8 x i16] vector. /// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of /// each of the eight 32-bit products. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mulhi_epi16(__m128i __a, __m128i __b) { return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); } @@ -2413,9 +2413,9 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a, /// A 128-bit unsigned [8 x i16] vector. /// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits /// of each of the eight 32-bit products. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, - __m128i __b) { - return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mulhi_epu16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_pmulhuw128((__v8hu)__a, (__v8hu)__b); } /// Multiplies the corresponding elements of two signed [8 x i16] @@ -2432,8 +2432,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a, /// A 128-bit signed [8 x i16] vector. /// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of /// each of the eight 32-bit products. -static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a, - __m128i __b) { +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_mullo_epi16(__m128i __a, __m128i __b) { return (__m128i)((__v8hu)__a * (__v8hu)__b); } diff --git a/clang/lib/Headers/mmintrin.h b/clang/lib/Headers/mmintrin.h index 5a02a45512345..3961b790cea74 100644 --- a/clang/lib/Headers/mmintrin.h +++ b/clang/lib/Headers/mmintrin.h @@ -57,6 +57,9 @@ typedef char __v16qi __attribute__((__vector_size__(16))); #define __trunc64(x) \ (__m64) __builtin_shufflevector((__v2di)(x), __extension__(__v2di){}, 0) +#define __zext128(x) \ + (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ + 1, 2, 3) #define __anyext128(x) \ (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ 1, -1, -1) @@ -723,11 +726,11 @@ _mm_madd_pi16(__m64 __m1, __m64 __m2) /// A 64-bit integer vector of [4 x i16]. /// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits /// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_mulhi_pi16(__m64 __m1, __m64 __m2) { - return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__anyext128(__m1), - (__v8hi)__anyext128(__m2))); + return __trunc64(__builtin_ia32_pmulhw128((__v8hi)__zext128(__m1), + (__v8hi)__zext128(__m2))); } /// Multiplies each 16-bit signed integer element of the first 64-bit diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h index 6a6436977303f..7bf6b84c328dc 100644 --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -24,6 +24,7 @@ typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1))); /* Unsigned types */ typedef unsigned int __v4su __attribute__((__vector_size__(16))); +typedef unsigned short __v8hu __attribute__((__vector_size__(16))); /* This header should only be included in a hosted environment as it depends on * a standard library to provide allocation routines. */ @@ -2447,11 +2448,11 @@ _mm_movemask_pi8(__m64 __a) /// \param __b /// A 64-bit integer vector containing one of the source operands. /// \returns A 64-bit integer vector containing the products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2 +static __inline__ __m64 __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR _mm_mulhi_pu16(__m64 __a, __m64 __b) { - return __trunc64(__builtin_ia32_pmulhuw128((__v8hi)__anyext128(__a), - (__v8hi)__anyext128(__b))); + return __trunc64(__builtin_ia32_pmulhuw128((__v8hu)__zext128(__a), + (__v8hu)__zext128(__b))); } /// Shuffles the 4 16-bit integers from a 64-bit integer vector to the diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index 1ed624cf63ab2..adbb854660a8b 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -893,12 +893,14 @@ __m256i test_mm256_mulhi_epu16(__m256i a, __m256i b) { // CHECK: call <16 x i16> @llvm.x86.avx2.pmulhu.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_mulhi_epu16(a, b); } +TEST_CONSTEXPR(match_v16hi(_mm256_mulhi_epu16((__m256i)(__v16hi){+1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16}, (__m256i)(__v16hi){-32, -30, +28, +26, -24, -22, +20, +18, -16, -14, +12, +10, -8, +6, -4, +2}), 0, -32, 0, 25, 4, -28, 0, 17, 8, -24, 0, 9, 12, 5, 14, 1)); __m256i test_mm256_mulhi_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_mulhi_epi16 // CHECK: call <16 x i16> @llvm.x86.avx2.pmulh.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_mulhi_epi16(a, b); } +TEST_CONSTEXPR(match_v16hi(_mm256_mulhi_epi16((__m256i)(__v16hi){+1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16}, (__m256i)(__v16hi){-32, -30, +28, +26, -24, -22, +20, +18, -16, -14, +12, +10, -8, +6, -4, +2}), -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, -1, -1, -1)); __m256i test_mm256_mulhrs_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_mulhrs_epi16 @@ -911,6 +913,7 @@ __m256i test_mm256_mullo_epi16(__m256i a, __m256i b) { // CHECK: mul <16 x i16> return _mm256_mullo_epi16(a, b); } +TEST_CONSTEXPR(match_v16hi(_mm256_mullo_epi16((__m256i)(__v16hi){+1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16}, (__m256i)(__v16hi){-32, -30, +28, +26, -24, -22, +20, +18, -16, -14, +12, +10, -8, +6, -4, +2}), -32, 60, 84, -104, -120, 132, 140, -144, -144, 140, 132, -120, -104, -84, -60, -32)); __m256i test_mm256_mullo_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_mullo_epi32 diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index 1d18ca8548a3a..37765eb9616b3 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -3,6 +3,7 @@ #include +#include "builtin_test_helpers.h" __mmask32 test_knot_mask32(__mmask32 a) { // CHECK-LABEL: @test_knot_mask32 @@ -823,6 +824,7 @@ __m512i test_mm512_mullo_epi16 (__m512i __A, __m512i __B) { //CHECK: mul <32 x i16> return _mm512_mullo_epi16(__A, __B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mullo_epi16((__m512i)(__v32hi){+1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, -28, +29, -30, +31, -32}, (__m512i)(__v32hi){-64, -62, +60, +58, -56, -54, +52, +50, -48, -46, +44, +42, -40, -38, +36, +34, -32, -30, +28, +26, -24, -22, +20, +18, -16, -14, +12, +10, -8, +6, -4, +2}), -64, 124, 180, -232, -280, 324, 364, -400, -432, 460, 484, -504, -520, 532, 540, -544, -544, 540, 532, -520, -504, 484, 460, -432, -400, 364, 324, -280, -232, -180, -124, -64)); __m512i test_mm512_mask_mullo_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { //CHECK-LABEL: @test_mm512_mask_mullo_epi16 @@ -1331,29 +1333,36 @@ __m512i test_mm512_mulhi_epi16(__m512i __A, __m512i __B) { // CHECK: @llvm.x86.avx512.pmulh.w.512 return _mm512_mulhi_epi16(__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mulhi_epi16((__m512i)(__v32hi){+1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, -28, +29, -30, +31, -32}, (__m512i)(__v32hi){-64, -62, +60, +58, -56, -54, +52, +50, -48, -46, +44, +42, -40, -38, +36, +34, -32, -30, +28, +26, -24, -22, +20, +18, -16, -14, +12, +10, -8, +6, -4, +2}), -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, -1, -1, -1)); + __m512i test_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_mulhi_epi16 // CHECK: @llvm.x86.avx512.pmulh.w.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_mulhi_epi16(__W,__U,__A,__B); } + __m512i test_mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_mulhi_epi16 // CHECK: @llvm.x86.avx512.pmulh.w.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_mulhi_epi16(__U,__A,__B); } + __m512i test_mm512_mulhi_epu16(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mulhi_epu16 // CHECK: @llvm.x86.avx512.pmulhu.w.512 return _mm512_mulhi_epu16(__A,__B); } +TEST_CONSTEXPR(match_v32hi(_mm512_mulhi_epu16((__m512i)(__v32hi){+1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16, +17, -18, +19, -20, +21, -22, +23, -24, +25, -26, +27, -28, +29, -30, +31, -32}, (__m512i)(__v32hi){-64, -62, +60, +58, -56, -54, +52, +50, -48, -46, +44, +42, -40, -38, +36, +34, -32, -30, +28, +26, -24, -22, +20, +18, -16, -14, +12, +10, -8, +6, -4, +2}), 0, -64, 0, 57, 4, -60, 0, 49, 8, -56, 0, 41, 12, -52, 0, 33, 16, -48, 0, 25, 20, -44, 0, 17, 24, -40, 0, 9, 28, 5, 30, 1)); + __m512i test_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_mulhi_epu16 // CHECK: @llvm.x86.avx512.pmulhu.w.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_mulhi_epu16(__W,__U,__A,__B); } + __m512i test_mm512_maskz_mulhi_epu16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_mulhi_epu16 // CHECK: @llvm.x86.avx512.pmulhu.w.512 diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index 30e2cfeca1fd0..6f20986dde3cc 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -365,12 +365,14 @@ __m64 test_mm_mulhi_pi16(__m64 a, __m64 b) { // CHECK: call <8 x i16> @llvm.x86.sse2.pmulh.w( return _mm_mulhi_pi16(a, b); } +TEST_CONSTEXPR(match_v4hi(_mm_mulhi_pi16((__m64)(__v4hi){+1, -2, +3, -4}, (__m64)(__v4hi){-10, +8, +6, -4}), -1, -1, 0, 0)); __m64 test_mm_mulhi_pu16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mulhi_pu16 // CHECK: call <8 x i16> @llvm.x86.sse2.pmulhu.w( return _mm_mulhi_pu16(a, b); } +TEST_CONSTEXPR(match_v4hi(_mm_mulhi_pu16((__m64)(__v4hi){+1, -2, +3, -4}, (__m64)(__v4hi){-10, +8, +6, -4}), 0, 7, 0, -8)); __m64 test_mm_mulhrs_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_mulhrs_pi16 diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c index affb6e7d3b7d9..23013dd75d641 100644 --- a/clang/test/CodeGen/X86/sse2-builtins.c +++ b/clang/test/CodeGen/X86/sse2-builtins.c @@ -940,18 +940,21 @@ __m128i test_mm_mulhi_epi16(__m128i A, __m128i B) { // CHECK: call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_mulhi_epi16(A, B); } +TEST_CONSTEXPR(match_v8hi(_mm_mulhi_epi16((__m128i)(__v8hi){+1, -2, +3, -4, +5, -6, +7, -8}, (__m128i)(__v8hi){-16, -14, +12, +10, -8, +6, -4, +2}), -1, 0, 0, -1, -1, -1, -1, -1)); __m128i test_mm_mulhi_epu16(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_mulhi_epu16 // CHECK: call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_mulhi_epu16(A, B); } +TEST_CONSTEXPR(match_v8hi(_mm_mulhi_epu16((__m128i)(__v8hi){+1, -2, +3, -4, +5, -6, +7, -8}, (__m128i)(__v8hi){-16, -14, +12, +10, -8, +6, -4, +2}), 0, -16, 0, 9, 4, 5, 6, 1)); __m128i test_mm_mullo_epi16(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_mullo_epi16 // CHECK: mul <8 x i16> %{{.*}}, %{{.*}} return _mm_mullo_epi16(A, B); } +TEST_CONSTEXPR(match_v8hi(_mm_mullo_epi16((__m128i)(__v8hi){+1, -2, +3, -4, +5, -6, +7, -8}, (__m128i)(__v8hi){-16, -14, +12, +10, -8, +6, -4, +2}), -16, 28, 36, -40, -40, -36, -28, -16)); __m128d test_mm_or_pd(__m128d A, __m128d B) { // CHECK-LABEL: test_mm_or_pd diff --git a/clang/test/CodeGen/builtins-x86.c b/clang/test/CodeGen/builtins-x86.c index c42c3216ec53c..31f309791c9f7 100644 --- a/clang/test/CodeGen/builtins-x86.c +++ b/clang/test/CodeGen/builtins-x86.c @@ -22,6 +22,7 @@ typedef float V2f __attribute__((vector_size(8))); // 128-bit typedef char V16c __attribute__((vector_size(16))); typedef signed short V8s __attribute__((vector_size(16))); +typedef unsigned short V8u __attribute__((vector_size(16))); typedef signed int V4i __attribute__((vector_size(16))); #ifndef OPENCL typedef signed long long V2LLi __attribute__((vector_size(16))); @@ -99,6 +100,7 @@ void f0(void) { // 128-bit V16c tmp_V16c; V8s tmp_V8s; + V8u tmp_V8u; V4i tmp_V4i; V2LLi tmp_V2LLi; V4f tmp_V4f; @@ -192,7 +194,7 @@ void f0(void) { tmp_V16c = __builtin_ia32_packsswb128(tmp_V8s, tmp_V8s); tmp_V8s = __builtin_ia32_packssdw128(tmp_V4i, tmp_V4i); tmp_V16c = __builtin_ia32_packuswb128(tmp_V8s, tmp_V8s); - tmp_V8s = __builtin_ia32_pmulhuw128(tmp_V8s, tmp_V8s); + tmp_V8u = __builtin_ia32_pmulhuw128(tmp_V8u, tmp_V8u); tmp_V4f = __builtin_ia32_addsubps(tmp_V4f, tmp_V4f); tmp_V2d = __builtin_ia32_addsubpd(tmp_V2d, tmp_V2d); tmp_V4f = __builtin_ia32_haddps(tmp_V4f, tmp_V4f);