Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -277,8 +277,8 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
def pslldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
def psrldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
}

let Features = "sse2",
Expand Down Expand Up @@ -594,12 +594,12 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psllw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
def pslldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
def pslldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">;
def pslld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
def psllq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
def psraw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
def psrad256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
def psrldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
def psrldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">;
def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
Expand Down Expand Up @@ -2052,8 +2052,8 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512
: X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
def psrlw512
: X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
def pslldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
def psrldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
def pslldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">;
def psrldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">;
}

let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
Expand Down
42 changes: 18 additions & 24 deletions clang/lib/CodeGen/TargetBuiltins/X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1814,59 +1814,53 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
case X86::BI__builtin_ia32_pslldqi256_byteshift:
case X86::BI__builtin_ia32_pslldqi512_byteshift: {
unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
// Builtin type is vXi64 so multiply by 8 to get bytes.
unsigned NumElts = ResultType->getNumElements() * 8;
auto *VecTy = cast<llvm::FixedVectorType>(Ops[0]->getType());
// Builtin type is vXi8.
unsigned NumElts = VecTy->getNumElements();
Value *Zero = llvm::Constant::getNullValue(VecTy);

// If pslldq is shifting the vector more than 15 bytes, emit zero.
if (ShiftVal >= 16)
return llvm::Constant::getNullValue(ResultType);
return Zero;

int Indices[64];
// 256/512-bit pslldq operates on 128-bit lanes so we need to handle that
for (unsigned l = 0; l != NumElts; l += 16) {
for (unsigned i = 0; i != 16; ++i) {
unsigned Idx = NumElts + i - ShiftVal;
if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand.
if (Idx < NumElts)
Idx -= NumElts - 16; // end of lane, switch operand.
Indices[l + i] = Idx + l;
}
}

auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
Value *Zero = llvm::Constant::getNullValue(VecTy);
Value *SV = Builder.CreateShuffleVector(
Zero, Cast, ArrayRef(Indices, NumElts), "pslldq");
return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast");
return Builder.CreateShuffleVector(Zero, Ops[0], ArrayRef(Indices, NumElts),
"pslldq");
}
case X86::BI__builtin_ia32_psrldqi128_byteshift:
case X86::BI__builtin_ia32_psrldqi256_byteshift:
case X86::BI__builtin_ia32_psrldqi512_byteshift: {
unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
// Builtin type is vXi64 so multiply by 8 to get bytes.
unsigned NumElts = ResultType->getNumElements() * 8;
auto *VecTy = cast<llvm::FixedVectorType>(Ops[0]->getType());
// Builtin type is vXi8.
unsigned NumElts = VecTy->getNumElements();
Value *Zero = llvm::Constant::getNullValue(VecTy);

// If psrldq is shifting the vector more than 15 bytes, emit zero.
if (ShiftVal >= 16)
return llvm::Constant::getNullValue(ResultType);
return Zero;

int Indices[64];
// 256/512-bit psrldq operates on 128-bit lanes so we need to handle that
for (unsigned l = 0; l != NumElts; l += 16) {
for (unsigned i = 0; i != 16; ++i) {
unsigned Idx = i + ShiftVal;
if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand.
if (Idx >= 16)
Idx += NumElts - 16; // end of lane, switch operand.
Indices[l + i] = Idx + l;
}
}

auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
Value *Zero = llvm::Constant::getNullValue(VecTy);
Value *SV = Builder.CreateShuffleVector(
Cast, Zero, ArrayRef(Indices, NumElts), "psrldq");
return Builder.CreateBitCast(SV, ResultType, "cast");
return Builder.CreateShuffleVector(Ops[0], Zero, ArrayRef(Indices, NumElts),
"psrldq");
}
case X86::BI__builtin_ia32_kshiftliqi:
case X86::BI__builtin_ia32_kshiftlihi:
Expand Down
20 changes: 12 additions & 8 deletions clang/lib/Headers/avx2intrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -2060,8 +2060,9 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
/// \param imm
/// An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_slli_si256(a, imm) \
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
#define _mm256_slli_si256(a, imm) \
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
(int)(imm)))

/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
Expand All @@ -2080,8 +2081,9 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
/// \param imm
/// An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_bslli_epi128(a, imm) \
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
#define _mm256_bslli_epi128(a, imm) \
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
(int)(imm)))

/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
/// left by \a __count bits, shifting in zero bits, and returns the result.
Expand Down Expand Up @@ -2299,8 +2301,9 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
/// \param imm
/// An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_srli_si256(a, imm) \
((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
#define _mm256_srli_si256(a, imm) \
((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
(int)(imm)))

/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
/// \a imm bytes, shifting in zero bytes, and returns the result. If
Expand All @@ -2319,8 +2322,9 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
/// \param imm
/// An unsigned immediate value specifying the shift count (in bytes).
/// \returns A 256-bit integer vector containing the result.
#define _mm256_bsrli_epi128(a, imm) \
((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
#define _mm256_bsrli_epi128(a, imm) \
((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
(int)(imm)))

/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
/// right by \a __count bits, shifting in zero bits, and returns the result.
Expand Down
10 changes: 6 additions & 4 deletions clang/lib/Headers/avx512bwintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -1459,8 +1459,9 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B) {
(__v32hi)_mm512_setzero_si512());
}

#define _mm512_bslli_epi128(a, imm) \
((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
#define _mm512_bslli_epi128(a, imm) \
((__m512i)__builtin_ia32_pslldqi512_byteshift((__v64qi)(__m512i)(a), \
(int)(imm)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_srlv_epi16(__m512i __A, __m512i __B)
Expand Down Expand Up @@ -1590,8 +1591,9 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) {
(__v32hi)_mm512_setzero_si512());
}

#define _mm512_bsrli_epi128(a, imm) \
((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
#define _mm512_bsrli_epi128(a, imm) \
((__m512i)__builtin_ia32_psrldqi512_byteshift((__v64qi)(__m512i)(a), \
(int)(imm)))

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)
Expand Down
8 changes: 4 additions & 4 deletions clang/lib/Headers/emmintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -2745,11 +2745,11 @@ _mm_xor_si128(__m128i __a, __m128i __b) {
/// \a a.
/// \returns A 128-bit integer vector containing the left-shifted value.
#define _mm_slli_si128(a, imm) \
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)(__m128i)(a), \
(int)(imm)))

#define _mm_bslli_si128(a, imm) \
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)(__m128i)(a), \
(int)(imm)))

/// Left-shifts each 16-bit value in the 128-bit integer vector operand
Expand Down Expand Up @@ -2954,11 +2954,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
/// \a a.
/// \returns A 128-bit integer vector containing the right-shifted value.
#define _mm_srli_si128(a, imm) \
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v16qi)(__m128i)(a), \
(int)(imm)))

#define _mm_bsrli_si128(a, imm) \
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v16qi)(__m128i)(a), \
(int)(imm)))

/// Right-shifts each of 16-bit values in the 128-bit integer vector
Expand Down
11 changes: 6 additions & 5 deletions clang/lib/Headers/tmmintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,11 +175,12 @@ _mm_abs_epi32(__m128i __a) {
/// An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 64-bit integer vector containing the concatenated right-shifted
/// value.
#define _mm_alignr_pi8(a, b, n) \
((__m64)__builtin_shufflevector( \
__builtin_ia32_psrldqi128_byteshift( \
__builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \
(n)), __extension__ (__v2di){}, 0))
#define _mm_alignr_pi8(a, b, n) \
((__m64)__builtin_shufflevector( \
(__v2di)__builtin_ia32_psrldqi128_byteshift( \
(__v16qi)__builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \
(n)), \
__extension__(__v2di){}, 0))

/// Horizontally adds the adjacent pairs of values contained in 2 packed
/// 128-bit vectors of [8 x i16].
Expand Down
4 changes: 2 additions & 2 deletions clang/lib/Headers/xmmintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -2520,8 +2520,8 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
// If there's a risk of spurious trap due to a 128-bit write, back up the
// pointer by 8 bytes and shift values in registers to match.
__p -= 8;
__d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
__n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
__d128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__d128, 8);
__n128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__n128, 8);
}

__builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);
Expand Down
2 changes: 2 additions & 0 deletions clang/test/CodeGen/X86/sse.c
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ __m128i test_mm_slli_si128_0(__m128i a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_mm_slli_si128_16(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> zeroinitializer
//
__m128i test_mm_slli_si128_16(__m128i a) {
Expand Down Expand Up @@ -65,6 +66,7 @@ __m128i test_mm_srli_si128_0(__m128i a) {
// CHECK-LABEL: define dso_local <2 x i64> @test_mm_srli_si128_16(
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
// CHECK-NEXT: ret <2 x i64> zeroinitializer
//
__m128i test_mm_srli_si128_16(__m128i a) {
Expand Down