diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index dd7727a39f693..aac502091b57e 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -277,8 +277,8 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">; def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">; - def pslldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">; - def psrldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">; + def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">; + def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">; } let Features = "sse2", @@ -594,12 +594,12 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; def psllw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; - def pslldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">; + def pslldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">; def pslld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; def psllq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; def psraw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; def psrad256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; - def psrldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">; + def psrldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">; def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; @@ -2052,8 +2052,8 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">; def psrlw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">; - def pslldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">; - def psrldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">; + def pslldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">; + def psrldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">; } let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in { diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp index a4974e45caf10..b924407b6ddd7 100644 --- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp @@ -1814,59 +1814,53 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID, case X86::BI__builtin_ia32_pslldqi256_byteshift: case X86::BI__builtin_ia32_pslldqi512_byteshift: { unsigned ShiftVal = cast(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast(Ops[0]->getType()); - // Builtin type is vXi64 so multiply by 8 to get bytes. - unsigned NumElts = ResultType->getNumElements() * 8; + auto *VecTy = cast(Ops[0]->getType()); + // Builtin type is vXi8. + unsigned NumElts = VecTy->getNumElements(); + Value *Zero = llvm::Constant::getNullValue(VecTy); // If pslldq is shifting the vector more than 15 bytes, emit zero. if (ShiftVal >= 16) - return llvm::Constant::getNullValue(ResultType); + return Zero; int Indices[64]; // 256/512-bit pslldq operates on 128-bit lanes so we need to handle that for (unsigned l = 0; l != NumElts; l += 16) { for (unsigned i = 0; i != 16; ++i) { unsigned Idx = NumElts + i - ShiftVal; - if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand. + if (Idx < NumElts) + Idx -= NumElts - 16; // end of lane, switch operand. Indices[l + i] = Idx + l; } } - - auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts); - Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast"); - Value *Zero = llvm::Constant::getNullValue(VecTy); - Value *SV = Builder.CreateShuffleVector( - Zero, Cast, ArrayRef(Indices, NumElts), "pslldq"); - return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast"); + return Builder.CreateShuffleVector(Zero, Ops[0], ArrayRef(Indices, NumElts), + "pslldq"); } case X86::BI__builtin_ia32_psrldqi128_byteshift: case X86::BI__builtin_ia32_psrldqi256_byteshift: case X86::BI__builtin_ia32_psrldqi512_byteshift: { unsigned ShiftVal = cast(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast(Ops[0]->getType()); - // Builtin type is vXi64 so multiply by 8 to get bytes. - unsigned NumElts = ResultType->getNumElements() * 8; + auto *VecTy = cast(Ops[0]->getType()); + // Builtin type is vXi8. + unsigned NumElts = VecTy->getNumElements(); + Value *Zero = llvm::Constant::getNullValue(VecTy); // If psrldq is shifting the vector more than 15 bytes, emit zero. if (ShiftVal >= 16) - return llvm::Constant::getNullValue(ResultType); + return Zero; int Indices[64]; // 256/512-bit psrldq operates on 128-bit lanes so we need to handle that for (unsigned l = 0; l != NumElts; l += 16) { for (unsigned i = 0; i != 16; ++i) { unsigned Idx = i + ShiftVal; - if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand. + if (Idx >= 16) + Idx += NumElts - 16; // end of lane, switch operand. Indices[l + i] = Idx + l; } } - - auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts); - Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast"); - Value *Zero = llvm::Constant::getNullValue(VecTy); - Value *SV = Builder.CreateShuffleVector( - Cast, Zero, ArrayRef(Indices, NumElts), "psrldq"); - return Builder.CreateBitCast(SV, ResultType, "cast"); + return Builder.CreateShuffleVector(Ops[0], Zero, ArrayRef(Indices, NumElts), + "psrldq"); } case X86::BI__builtin_ia32_kshiftliqi: case X86::BI__builtin_ia32_kshiftlihi: diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index fc12a9bf15e57..e35c159fec7fd 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -2060,8 +2060,9 @@ _mm256_sign_epi32(__m256i __a, __m256i __b) /// \param imm /// An unsigned immediate value specifying the shift count (in bytes). /// \returns A 256-bit integer vector containing the result. -#define _mm256_slli_si256(a, imm) \ - ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) +#define _mm256_slli_si256(a, imm) \ + ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \ + (int)(imm))) /// Shifts each 128-bit half of the 256-bit integer vector \a a left by /// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm @@ -2080,8 +2081,9 @@ _mm256_sign_epi32(__m256i __a, __m256i __b) /// \param imm /// An unsigned immediate value specifying the shift count (in bytes). /// \returns A 256-bit integer vector containing the result. -#define _mm256_bslli_epi128(a, imm) \ - ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) +#define _mm256_bslli_epi128(a, imm) \ + ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \ + (int)(imm))) /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a /// left by \a __count bits, shifting in zero bits, and returns the result. @@ -2299,8 +2301,9 @@ _mm256_sra_epi32(__m256i __a, __m128i __count) /// \param imm /// An unsigned immediate value specifying the shift count (in bytes). /// \returns A 256-bit integer vector containing the result. -#define _mm256_srli_si256(a, imm) \ - ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) +#define _mm256_srli_si256(a, imm) \ + ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \ + (int)(imm))) /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by /// \a imm bytes, shifting in zero bytes, and returns the result. If @@ -2319,8 +2322,9 @@ _mm256_sra_epi32(__m256i __a, __m128i __count) /// \param imm /// An unsigned immediate value specifying the shift count (in bytes). /// \returns A 256-bit integer vector containing the result. -#define _mm256_bsrli_epi128(a, imm) \ - ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) +#define _mm256_bsrli_epi128(a, imm) \ + ((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \ + (int)(imm))) /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a /// right by \a __count bits, shifting in zero bits, and returns the result. diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index 77820a2ca041c..599cfbe479676 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -1459,8 +1459,9 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B) { (__v32hi)_mm512_setzero_si512()); } -#define _mm512_bslli_epi128(a, imm) \ - ((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) +#define _mm512_bslli_epi128(a, imm) \ + ((__m512i)__builtin_ia32_pslldqi512_byteshift((__v64qi)(__m512i)(a), \ + (int)(imm))) static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR _mm512_srlv_epi16(__m512i __A, __m512i __B) @@ -1590,8 +1591,9 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) { (__v32hi)_mm512_setzero_si512()); } -#define _mm512_bsrli_epi128(a, imm) \ - ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) +#define _mm512_bsrli_epi128(a, imm) \ + ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v64qi)(__m512i)(a), \ + (int)(imm))) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index e4fbe011239d6..12260ec6ea14c 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -2745,11 +2745,11 @@ _mm_xor_si128(__m128i __a, __m128i __b) { /// \a a. /// \returns A 128-bit integer vector containing the left-shifted value. #define _mm_slli_si128(a, imm) \ - ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ + ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)(__m128i)(a), \ (int)(imm))) #define _mm_bslli_si128(a, imm) \ - ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \ + ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)(__m128i)(a), \ (int)(imm))) /// Left-shifts each 16-bit value in the 128-bit integer vector operand @@ -2954,11 +2954,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a, /// \a a. /// \returns A 128-bit integer vector containing the right-shifted value. #define _mm_srli_si128(a, imm) \ - ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ + ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v16qi)(__m128i)(a), \ (int)(imm))) #define _mm_bsrli_si128(a, imm) \ - ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \ + ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v16qi)(__m128i)(a), \ (int)(imm))) /// Right-shifts each of 16-bit values in the 128-bit integer vector diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index 6e7107e36ea79..d40f0c56b2c5a 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -175,11 +175,12 @@ _mm_abs_epi32(__m128i __a) { /// An immediate operand specifying how many bytes to right-shift the result. /// \returns A 64-bit integer vector containing the concatenated right-shifted /// value. -#define _mm_alignr_pi8(a, b, n) \ - ((__m64)__builtin_shufflevector( \ - __builtin_ia32_psrldqi128_byteshift( \ - __builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \ - (n)), __extension__ (__v2di){}, 0)) +#define _mm_alignr_pi8(a, b, n) \ + ((__m64)__builtin_shufflevector( \ + (__v2di)__builtin_ia32_psrldqi128_byteshift( \ + (__v16qi)__builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \ + (n)), \ + __extension__(__v2di){}, 0)) /// Horizontally adds the adjacent pairs of values contained in 2 packed /// 128-bit vectors of [8 x i16]. diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h index 6b70f245e2564..4891e3ce077b5 100644 --- a/clang/lib/Headers/xmmintrin.h +++ b/clang/lib/Headers/xmmintrin.h @@ -2520,8 +2520,8 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) // If there's a risk of spurious trap due to a 128-bit write, back up the // pointer by 8 bytes and shift values in registers to match. __p -= 8; - __d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8); - __n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8); + __d128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__d128, 8); + __n128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__n128, 8); } __builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p); diff --git a/clang/test/CodeGen/X86/sse.c b/clang/test/CodeGen/X86/sse.c index 017bdd7846fa3..38cc7179543d5 100644 --- a/clang/test/CodeGen/X86/sse.c +++ b/clang/test/CodeGen/X86/sse.c @@ -32,6 +32,7 @@ __m128i test_mm_slli_si128_0(__m128i a) { // CHECK-LABEL: define dso_local <2 x i64> @test_mm_slli_si128_16( // CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> zeroinitializer // __m128i test_mm_slli_si128_16(__m128i a) { @@ -65,6 +66,7 @@ __m128i test_mm_srli_si128_0(__m128i a) { // CHECK-LABEL: define dso_local <2 x i64> @test_mm_srli_si128_16( // CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8> // CHECK-NEXT: ret <2 x i64> zeroinitializer // __m128i test_mm_srli_si128_16(__m128i a) {