Skip to content

Commit 5169bb4

Browse files
authored
[clang][x86] Change SLLDQ/SRLDQ byte shift intrinsics to use vXi8 types instead of vXi64 (#158671)
As noted on #156494 and #157403 - its much easier to work with the byte shift intrinsics if we treat them as vXi8 types instead of vXi64 types which will require bitcasting We already do this for the PALIGNR intrinsics which are a more advanced version of the same shuffle
1 parent 67c335c commit 5169bb4

File tree

8 files changed

+56
-53
lines changed

8 files changed

+56
-53
lines changed

clang/include/clang/Basic/BuiltinsX86.td

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -277,8 +277,8 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i
277277
def pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
278278
def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
279279
def pmaddwd128 : X86Builtin<"_Vector<4, int>(_Vector<8, short>, _Vector<8, short>)">;
280-
def pslldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
281-
def psrldqi128_byteshift : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Constant int)">;
280+
def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
281+
def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">;
282282
}
283283

284284
let Features = "sse2",
@@ -594,12 +594,12 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
594594
def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
595595
def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
596596
def psllw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
597-
def pslldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
597+
def pslldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">;
598598
def pslld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
599599
def psllq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
600600
def psraw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
601601
def psrad256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
602-
def psrldqi256_byteshift : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
602+
def psrldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">;
603603
def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
604604
def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
605605
def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
@@ -2052,8 +2052,8 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512
20522052
: X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
20532053
def psrlw512
20542054
: X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">;
2055-
def pslldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
2056-
def psrldqi512_byteshift : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Constant int)">;
2055+
def pslldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">;
2056+
def psrldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">;
20572057
}
20582058

20592059
let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<128>] in {

clang/lib/CodeGen/TargetBuiltins/X86.cpp

Lines changed: 18 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1814,59 +1814,53 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
18141814
case X86::BI__builtin_ia32_pslldqi256_byteshift:
18151815
case X86::BI__builtin_ia32_pslldqi512_byteshift: {
18161816
unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
1817-
auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
1818-
// Builtin type is vXi64 so multiply by 8 to get bytes.
1819-
unsigned NumElts = ResultType->getNumElements() * 8;
1817+
auto *VecTy = cast<llvm::FixedVectorType>(Ops[0]->getType());
1818+
// Builtin type is vXi8.
1819+
unsigned NumElts = VecTy->getNumElements();
1820+
Value *Zero = llvm::Constant::getNullValue(VecTy);
18201821

18211822
// If pslldq is shifting the vector more than 15 bytes, emit zero.
18221823
if (ShiftVal >= 16)
1823-
return llvm::Constant::getNullValue(ResultType);
1824+
return Zero;
18241825

18251826
int Indices[64];
18261827
// 256/512-bit pslldq operates on 128-bit lanes so we need to handle that
18271828
for (unsigned l = 0; l != NumElts; l += 16) {
18281829
for (unsigned i = 0; i != 16; ++i) {
18291830
unsigned Idx = NumElts + i - ShiftVal;
1830-
if (Idx < NumElts) Idx -= NumElts - 16; // end of lane, switch operand.
1831+
if (Idx < NumElts)
1832+
Idx -= NumElts - 16; // end of lane, switch operand.
18311833
Indices[l + i] = Idx + l;
18321834
}
18331835
}
1834-
1835-
auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
1836-
Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
1837-
Value *Zero = llvm::Constant::getNullValue(VecTy);
1838-
Value *SV = Builder.CreateShuffleVector(
1839-
Zero, Cast, ArrayRef(Indices, NumElts), "pslldq");
1840-
return Builder.CreateBitCast(SV, Ops[0]->getType(), "cast");
1836+
return Builder.CreateShuffleVector(Zero, Ops[0], ArrayRef(Indices, NumElts),
1837+
"pslldq");
18411838
}
18421839
case X86::BI__builtin_ia32_psrldqi128_byteshift:
18431840
case X86::BI__builtin_ia32_psrldqi256_byteshift:
18441841
case X86::BI__builtin_ia32_psrldqi512_byteshift: {
18451842
unsigned ShiftVal = cast<llvm::ConstantInt>(Ops[1])->getZExtValue() & 0xff;
1846-
auto *ResultType = cast<llvm::FixedVectorType>(Ops[0]->getType());
1847-
// Builtin type is vXi64 so multiply by 8 to get bytes.
1848-
unsigned NumElts = ResultType->getNumElements() * 8;
1843+
auto *VecTy = cast<llvm::FixedVectorType>(Ops[0]->getType());
1844+
// Builtin type is vXi8.
1845+
unsigned NumElts = VecTy->getNumElements();
1846+
Value *Zero = llvm::Constant::getNullValue(VecTy);
18491847

18501848
// If psrldq is shifting the vector more than 15 bytes, emit zero.
18511849
if (ShiftVal >= 16)
1852-
return llvm::Constant::getNullValue(ResultType);
1850+
return Zero;
18531851

18541852
int Indices[64];
18551853
// 256/512-bit psrldq operates on 128-bit lanes so we need to handle that
18561854
for (unsigned l = 0; l != NumElts; l += 16) {
18571855
for (unsigned i = 0; i != 16; ++i) {
18581856
unsigned Idx = i + ShiftVal;
1859-
if (Idx >= 16) Idx += NumElts - 16; // end of lane, switch operand.
1857+
if (Idx >= 16)
1858+
Idx += NumElts - 16; // end of lane, switch operand.
18601859
Indices[l + i] = Idx + l;
18611860
}
18621861
}
1863-
1864-
auto *VecTy = llvm::FixedVectorType::get(Int8Ty, NumElts);
1865-
Value *Cast = Builder.CreateBitCast(Ops[0], VecTy, "cast");
1866-
Value *Zero = llvm::Constant::getNullValue(VecTy);
1867-
Value *SV = Builder.CreateShuffleVector(
1868-
Cast, Zero, ArrayRef(Indices, NumElts), "psrldq");
1869-
return Builder.CreateBitCast(SV, ResultType, "cast");
1862+
return Builder.CreateShuffleVector(Ops[0], Zero, ArrayRef(Indices, NumElts),
1863+
"psrldq");
18701864
}
18711865
case X86::BI__builtin_ia32_kshiftliqi:
18721866
case X86::BI__builtin_ia32_kshiftlihi:

clang/lib/Headers/avx2intrin.h

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2060,8 +2060,9 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
20602060
/// \param imm
20612061
/// An unsigned immediate value specifying the shift count (in bytes).
20622062
/// \returns A 256-bit integer vector containing the result.
2063-
#define _mm256_slli_si256(a, imm) \
2064-
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2063+
#define _mm256_slli_si256(a, imm) \
2064+
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
2065+
(int)(imm)))
20652066

20662067
/// Shifts each 128-bit half of the 256-bit integer vector \a a left by
20672068
/// \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
@@ -2080,8 +2081,9 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
20802081
/// \param imm
20812082
/// An unsigned immediate value specifying the shift count (in bytes).
20822083
/// \returns A 256-bit integer vector containing the result.
2083-
#define _mm256_bslli_epi128(a, imm) \
2084-
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
2084+
#define _mm256_bslli_epi128(a, imm) \
2085+
((__m256i)__builtin_ia32_pslldqi256_byteshift((__v32qi)(__m256i)(a), \
2086+
(int)(imm)))
20852087

20862088
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
20872089
/// left by \a __count bits, shifting in zero bits, and returns the result.
@@ -2299,8 +2301,9 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
22992301
/// \param imm
23002302
/// An unsigned immediate value specifying the shift count (in bytes).
23012303
/// \returns A 256-bit integer vector containing the result.
2302-
#define _mm256_srli_si256(a, imm) \
2303-
((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
2304+
#define _mm256_srli_si256(a, imm) \
2305+
((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
2306+
(int)(imm)))
23042307

23052308
/// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
23062309
/// \a imm bytes, shifting in zero bytes, and returns the result. If
@@ -2319,8 +2322,9 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
23192322
/// \param imm
23202323
/// An unsigned immediate value specifying the shift count (in bytes).
23212324
/// \returns A 256-bit integer vector containing the result.
2322-
#define _mm256_bsrli_epi128(a, imm) \
2323-
((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
2325+
#define _mm256_bsrli_epi128(a, imm) \
2326+
((__m256i)__builtin_ia32_psrldqi256_byteshift((__v32qi)(__m256i)(a), \
2327+
(int)(imm)))
23242328

23252329
/// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
23262330
/// right by \a __count bits, shifting in zero bits, and returns the result.

clang/lib/Headers/avx512bwintrin.h

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1459,8 +1459,9 @@ _mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B) {
14591459
(__v32hi)_mm512_setzero_si512());
14601460
}
14611461

1462-
#define _mm512_bslli_epi128(a, imm) \
1463-
((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
1462+
#define _mm512_bslli_epi128(a, imm) \
1463+
((__m512i)__builtin_ia32_pslldqi512_byteshift((__v64qi)(__m512i)(a), \
1464+
(int)(imm)))
14641465

14651466
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
14661467
_mm512_srlv_epi16(__m512i __A, __m512i __B)
@@ -1590,8 +1591,9 @@ _mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) {
15901591
(__v32hi)_mm512_setzero_si512());
15911592
}
15921593

1593-
#define _mm512_bsrli_epi128(a, imm) \
1594-
((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm)))
1594+
#define _mm512_bsrli_epi128(a, imm) \
1595+
((__m512i)__builtin_ia32_psrldqi512_byteshift((__v64qi)(__m512i)(a), \
1596+
(int)(imm)))
15951597

15961598
static __inline__ __m512i __DEFAULT_FN_ATTRS512
15971599
_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A)

clang/lib/Headers/emmintrin.h

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2745,11 +2745,11 @@ _mm_xor_si128(__m128i __a, __m128i __b) {
27452745
/// \a a.
27462746
/// \returns A 128-bit integer vector containing the left-shifted value.
27472747
#define _mm_slli_si128(a, imm) \
2748-
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2748+
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)(__m128i)(a), \
27492749
(int)(imm)))
27502750

27512751
#define _mm_bslli_si128(a, imm) \
2752-
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
2752+
((__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)(__m128i)(a), \
27532753
(int)(imm)))
27542754

27552755
/// Left-shifts each 16-bit value in the 128-bit integer vector operand
@@ -2954,11 +2954,11 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
29542954
/// \a a.
29552955
/// \returns A 128-bit integer vector containing the right-shifted value.
29562956
#define _mm_srli_si128(a, imm) \
2957-
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2957+
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v16qi)(__m128i)(a), \
29582958
(int)(imm)))
29592959

29602960
#define _mm_bsrli_si128(a, imm) \
2961-
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
2961+
((__m128i)__builtin_ia32_psrldqi128_byteshift((__v16qi)(__m128i)(a), \
29622962
(int)(imm)))
29632963

29642964
/// Right-shifts each of 16-bit values in the 128-bit integer vector

clang/lib/Headers/tmmintrin.h

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -175,11 +175,12 @@ _mm_abs_epi32(__m128i __a) {
175175
/// An immediate operand specifying how many bytes to right-shift the result.
176176
/// \returns A 64-bit integer vector containing the concatenated right-shifted
177177
/// value.
178-
#define _mm_alignr_pi8(a, b, n) \
179-
((__m64)__builtin_shufflevector( \
180-
__builtin_ia32_psrldqi128_byteshift( \
181-
__builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \
182-
(n)), __extension__ (__v2di){}, 0))
178+
#define _mm_alignr_pi8(a, b, n) \
179+
((__m64)__builtin_shufflevector( \
180+
(__v2di)__builtin_ia32_psrldqi128_byteshift( \
181+
(__v16qi)__builtin_shufflevector((__v1di)(a), (__v1di)(b), 1, 0), \
182+
(n)), \
183+
__extension__(__v2di){}, 0))
183184

184185
/// Horizontally adds the adjacent pairs of values contained in 2 packed
185186
/// 128-bit vectors of [8 x i16].

clang/lib/Headers/xmmintrin.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2520,8 +2520,8 @@ _mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
25202520
// If there's a risk of spurious trap due to a 128-bit write, back up the
25212521
// pointer by 8 bytes and shift values in registers to match.
25222522
__p -= 8;
2523-
__d128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__d128, 8);
2524-
__n128 = __builtin_ia32_pslldqi128_byteshift((__v2di)__n128, 8);
2523+
__d128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__d128, 8);
2524+
__n128 = (__m128i)__builtin_ia32_pslldqi128_byteshift((__v16qi)__n128, 8);
25252525
}
25262526

25272527
__builtin_ia32_maskmovdqu((__v16qi)__d128, (__v16qi)__n128, __p);

clang/test/CodeGen/X86/sse.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ __m128i test_mm_slli_si128_0(__m128i a) {
3232
// CHECK-LABEL: define dso_local <2 x i64> @test_mm_slli_si128_16(
3333
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
3434
// CHECK-NEXT: [[ENTRY:.*:]]
35+
// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
3536
// CHECK-NEXT: ret <2 x i64> zeroinitializer
3637
//
3738
__m128i test_mm_slli_si128_16(__m128i a) {
@@ -65,6 +66,7 @@ __m128i test_mm_srli_si128_0(__m128i a) {
6566
// CHECK-LABEL: define dso_local <2 x i64> @test_mm_srli_si128_16(
6667
// CHECK-SAME: <2 x i64> noundef [[A:%.*]]) #[[ATTR0]] {
6768
// CHECK-NEXT: [[ENTRY:.*:]]
69+
// CHECK-NEXT: [[CAST:%.*]] = bitcast <2 x i64> [[A]] to <16 x i8>
6870
// CHECK-NEXT: ret <2 x i64> zeroinitializer
6971
//
7072
__m128i test_mm_srli_si128_16(__m128i a) {

0 commit comments

Comments
 (0)