Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -124,14 +124,14 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in {
}

def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">;
def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
}

let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">;
def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">;
}
}

Expand Down Expand Up @@ -588,7 +588,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
def psignb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
Expand Down Expand Up @@ -627,6 +626,8 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
def pmuludq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;

def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;

def psllwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">;
def pslldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">;
def psllqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">;
Expand Down Expand Up @@ -1318,14 +1319,15 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>

let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
def ucmpw512_mask : X86Builtin<"unsigned int(_Vector<32, short>, _Vector<32, short>, _Constant int, unsigned int)">;
def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}

let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def packsswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
def packssdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;
def packuswb512 : X86Builtin<"_Vector<64, char>(_Vector<32, short>, _Vector<32, short>)">;
def packusdw512 : X86Builtin<"_Vector<32, short>(_Vector<16, int>, _Vector<16, int>)">;

def pshufb512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>)">;
}

let Features = "avx512cd,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
Expand Down
40 changes: 40 additions & 0 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2714,6 +2714,41 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
return true;
}

static bool interp__builtin_ia32_pshufb(InterpState &S, CodePtr OpPC,
const CallExpr *Call) {
assert(Call->getNumArgs() == 2 && "masked forms handled via select*");
const Pointer &Control = S.Stk.pop<Pointer>();
const Pointer &Src = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();

unsigned NumElems = Dst.getNumElems();
PrimType ElemT = Dst.getFieldDesc()->getPrimType();
unsigned ElemBits = static_cast<unsigned>(primSize(ElemT) * 8);

assert(NumElems == 16 || NumElems == 32 || NumElems == 64);
assert(NumElems == Control.getNumElems());
assert(NumElems == Dst.getNumElems());

if (ElemBits != 8)
return false;

for (unsigned Idx = 0; Idx != NumElems; ++Idx) {
uint8_t Ctlb = static_cast<uint8_t>(Control.elem<int8_t>(Idx));

if (Ctlb & 0x80) {
Dst.elem<int8_t>(Idx) = 0;
} else {
unsigned LaneBase = (Idx / 16) * 16;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this just a right shift and a left shift? That would seem more readable code, intent wise.

Also 16 is a magic number, can we name it.

Same applies in the code below.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I can change that ,so LaneBase will then be (Idx >> 4) <<4, then there is no need for naming 16 , and i can also name 0x80 and 0x0F MSBMask and LowNibbleMask for clarity.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure if MSBMask/LowNibbleMask obfuscation is a great idea - keeping reasonably close to the psuedocode in the Intel Intrinsics Guide is probably a better guideline.

unsigned SrcOffset = Ctlb & 0x0F;
unsigned SrcIdx = LaneBase + SrcOffset;

Dst.elem<int8_t>(Idx) = Src.elem<int8_t>(SrcIdx);
}
}
Dst.initializeAllElements();
return true;
}

static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC,
const CallExpr *Call, bool IsShufHW) {
assert(Call->getNumArgs() == 2 && "masked forms handled via select*");
Expand Down Expand Up @@ -3739,6 +3774,11 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_selectpd_512:
return interp__builtin_select(S, OpPC, Call);

case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
case X86::BI__builtin_ia32_pshufb512:
return interp__builtin_ia32_pshufb(S, OpPC, Call);

case X86::BI__builtin_ia32_pshuflw:
case X86::BI__builtin_ia32_pshuflw256:
case X86::BI__builtin_ia32_pshuflw512:
Expand Down
54 changes: 54 additions & 0 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11615,6 +11615,51 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
return true;
}

static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is really a shame we are duplicating code so much. If we find a bug in this implementation, will folks really remember to fix it in both places? @RKSimon @tbaederr

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I initially explored evalPshufBuiltin, but since the function handles the control as a scalar and applies different shuffle logic, integrating the pshufb seemed to require heavy conditional branching which i thought might complicate readability. However, this is my first LLVM contribution and i completely understand if i am missing some broader context. I am happy to rework this or refactor both implementations together if that would be a good long term solution.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have been looking at options for using a callback mechanism for shuffle mask decoding - similar to what we do for binops - I've suggested this to @chaitanyav on #164078 and we can build on that.

APValue &Out) {
APValue SrcVec, ControlVec;
if (!EvaluateAsRValue(Info, Call->getArg(0), SrcVec))
return false;
if (!EvaluateAsRValue(Info, Call->getArg(1), ControlVec))
return false;

const auto *VT = Call->getType()->getAs<VectorType>();
if (!VT)
return false;

QualType ElemT = VT->getElementType();
unsigned ElemBits = Info.Ctx.getTypeSize(ElemT);

if (ElemBits != 8)
return false;
unsigned NumElts = VT->getNumElements();
if (NumElts != 16 && NumElts != 32 && NumElts != 64)
return false;

SmallVector<APValue, 64> ResultElements;
ResultElements.reserve(NumElts);

for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
APValue CtlVal = ControlVec.getVectorElt(Idx);
APSInt CtlByte = CtlVal.getInt();
uint8_t Ctl = static_cast<uint8_t>(CtlByte.getZExtValue() & 0xFF);

if (Ctl & 0x80) {
APSInt Zero(ElemBits, /*isUnsigned*/ false);
Zero = 0;
ResultElements.push_back(APValue(Zero));
} else {
unsigned LaneBase = (Idx / 16) * 16;
unsigned SrcOffset = Ctl & 0x0F;
unsigned SrcIdx = LaneBase + SrcOffset;

ResultElements.push_back(SrcVec.getVectorElt(SrcIdx));
}
}
Out = APValue(ResultElements.data(), ResultElements.size());
return true;
}

static bool evalPshufBuiltin(EvalInfo &Info, const CallExpr *Call,
bool IsShufHW, APValue &Out) {
APValue Vec;
Expand Down Expand Up @@ -12189,6 +12234,15 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}

case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
case X86::BI__builtin_ia32_pshufb512: {
APValue R;
if (!evalPshufbBuiltin(Info, E, R))
return false;
return Success(R, E);
}

case X86::BI__builtin_ia32_pshuflw:
case X86::BI__builtin_ia32_pshuflw256:
case X86::BI__builtin_ia32_pshuflw512: {
Expand Down
5 changes: 2 additions & 3 deletions clang/lib/Headers/avx2intrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -1858,9 +1858,8 @@ _mm256_sad_epu8(__m256i __a, __m256i __b)
/// control byte specify the index (within the same 128-bit half) of \a __a
/// to copy to the result byte.
/// \returns A 256-bit integer vector containing the result.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_shuffle_epi8(__m256i __a, __m256i __b)
{
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_shuffle_epi8(__m256i __a, __m256i __b) {
return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
}

Expand Down
15 changes: 6 additions & 9 deletions clang/lib/Headers/avx512bwintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -866,23 +866,20 @@ _mm512_mask_min_epu16(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) {
(__v32hi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_shuffle_epi8(__m512i __A, __m512i __B)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_shuffle_epi8(__m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
(__v64qi)_mm512_shuffle_epi8(__A, __B),
(__v64qi)__W);
}

static __inline__ __m512i __DEFAULT_FN_ATTRS512
_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
static __inline__ __m512i __DEFAULT_FN_ATTRS512_CONSTEXPR
_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U,
(__v64qi)_mm512_shuffle_epi8(__A, __B),
(__v64qi)_mm512_setzero_si512());
Expand Down
20 changes: 8 additions & 12 deletions clang/lib/Headers/avx512vlbwintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -1067,33 +1067,29 @@ _mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) {
(__v16hi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B)
{
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
(__v16qi)_mm_shuffle_epi8(__A, __B),
(__v16qi)__W);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B)
{
static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U,
(__v16qi)_mm_shuffle_epi8(__A, __B),
(__v16qi)_mm_setzero_si128());
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B)
{
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
(__v32qi)_mm256_shuffle_epi8(__A, __B),
(__v32qi)__W);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B)
{
static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U,
(__v32qi)_mm256_shuffle_epi8(__A, __B),
(__v32qi)_mm256_setzero_si256());
Expand Down
21 changes: 10 additions & 11 deletions clang/lib/Headers/tmmintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -603,10 +603,9 @@ _mm_mulhrs_pi16(__m64 __a, __m64 __b)
/// Bits [6:4] Reserved. \n
/// Bits [3:0] select the source byte to be copied.
/// \returns A 128-bit integer vector containing the copied or cleared values.
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_shuffle_epi8(__m128i __a, __m128i __b)
{
return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_shuffle_epi8(__m128i __a, __m128i __b) {
return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
}

/// Copies the 8-bit integers from a 64-bit integer vector to the
Expand All @@ -628,13 +627,13 @@ _mm_shuffle_epi8(__m128i __a, __m128i __b)
/// destination. \n
/// Bits [2:0] select the source byte to be copied.
/// \returns A 64-bit integer vector containing the copied or cleared values.
static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_shuffle_pi8(__m64 __a, __m64 __b)
{
return __trunc64(__builtin_ia32_pshufb128(
(__v16qi)__builtin_shufflevector(
(__v2si)(__a), __extension__ (__v2si){}, 0, 1, 0, 1),
(__v16qi)__anyext128(__b)));
static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR
_mm_shuffle_pi8(__m64 __a, __m64 __b) {
return __trunc64(__builtin_ia32_pshufb128(
(__v16qi)__builtin_shufflevector((__v2si)(__a), __extension__(__v2si){},
0, 1, 0, 1),
(__v16qi)__builtin_shufflevector((__v2si)(__b), __extension__(__v2si){},
0, 1, 0, 1)));
}

/// For each 8-bit integer in the first source operand, perform one of
Expand Down
2 changes: 2 additions & 0 deletions clang/test/CodeGen/X86/avx2-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1106,6 +1106,8 @@ __m256i test_mm256_shuffle_epi8(__m256i a, __m256i b) {
return _mm256_shuffle_epi8(a, b);
}

TEST_CONSTEXPR(match_v32qi(_mm256_shuffle_epi8((__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){0,33,2,35,4,37,6,39,8,41,10,43,12,45,14,47,16,49,18,51,20,53,22,55,24,57,26,59,28,61,30,63}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31));

__m256i test_mm256_shuffle_epi32(__m256i a) {
// CHECK-LABEL: test_mm256_shuffle_epi32
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <8 x i32> <i32 3, i32 3, i32 0, i32 0, i32 7, i32 7, i32 4, i32 4>
Expand Down
9 changes: 9 additions & 0 deletions clang/test/CodeGen/X86/avx512bw-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1466,18 +1466,27 @@ __m512i test_mm512_shuffle_epi8(__m512i __A, __m512i __B) {
// CHECK: @llvm.x86.avx512.pshuf.b.512
return _mm512_shuffle_epi8(__A,__B);
}

TEST_CONSTEXPR(match_v64qi(_mm512_shuffle_epi8((__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95}), 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63));

__m512i test_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_mask_shuffle_epi8
// CHECK: @llvm.x86.avx512.pshuf.b.512
// CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_mask_shuffle_epi8(__W,__U,__A,__B);
}

TEST_CONSTEXPR(match_v64qi(_mm512_mask_shuffle_epi8((__m512i)(__v64qi){1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7,8,8,8,8,8,8,8,8}, 0xFFFFFFFF00000000, (__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32,63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48));

__m512i test_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_maskz_shuffle_epi8
// CHECK: @llvm.x86.avx512.pshuf.b.512
// CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
return _mm512_maskz_shuffle_epi8(__U,__A,__B);
}

TEST_CONSTEXPR(match_v64qi(_mm512_maskz_shuffle_epi8(0x8888888888888888,(__m512i)(__v64qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63}, (__m512i)(__v64qi){127,126,125,124,123,122,121,120,119,118,117,116,115,114,113,112,111,110,109,108,107,106,105,104,103,102,101,100,99,98,97,96,95,94,93,92,91,90,89,88,87,86,85,84,83,82,81,80,79,78,77,76,75,74,73,72,71,70,69,68,67,66,65,64}), 0,0,0,12,0,0,0,8,0,0,0,4,0,0,0,0,0,0,0,28,0,0,0,24,0,0,0,20,0,0,0,16,0,0,0,44,0,0,0,40,0,0,0,36,0,0,0,32,0,0,0,60,0,0,0,56,0,0,0,52,0,0,0,48));

__m512i test_mm512_subs_epi8(__m512i __A, __m512i __B) {
// CHECK-LABEL: test_mm512_subs_epi8
// CHECK: @llvm.ssub.sat.v64i8
Expand Down
13 changes: 13 additions & 0 deletions clang/test/CodeGen/X86/avx512vlbw-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1688,24 +1688,37 @@ __m128i test_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m12
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_mask_shuffle_epi8(__W,__U,__A,__B);
}

TEST_CONSTEXPR(match_v16qi(_mm_mask_shuffle_epi8((__m128i)(__v16qi){1,1,1,1,1,1,1,1,2,2,4,4,6,6,8,8}, 0x00FF, (__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, (__m128i)(__v16qi){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 15,14,13,12,11,10,9,8,2,2,4,4,6,6,8,8));

__m128i test_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_maskz_shuffle_epi8
// CHECK: @llvm.x86.ssse3.pshuf.b
// CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}}
return _mm_maskz_shuffle_epi8(__U,__A,__B);
}

TEST_CONSTEXPR(match_v16qi(_mm_maskz_shuffle_epi8(0xAAAA, (__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}, (__m128i)(__v16qi){15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 0,14,0,12,0,10,0,8,0,6,0,4,0,2,0,0));

__m256i test_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_mask_shuffle_epi8
// CHECK: @llvm.x86.avx2.pshuf.b
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_mask_shuffle_epi8(__W,__U,__A,__B);
}

TEST_CONSTEXPR(match_v32qi(_mm256_mask_shuffle_epi8((__m256i)(__v32qi){1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4}, 0x80808080, (__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 1,1,1,1,1,1,1,8,2,2,2,2,2,2,2,0,3,3,3,3,3,3,3,24,4,4,4,4,4,4,4,16));


__m256i test_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) {
// CHECK-LABEL: test_mm256_maskz_shuffle_epi8
// CHECK: @llvm.x86.avx2.pshuf.b
// CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}}
return _mm256_maskz_shuffle_epi8(__U,__A,__B);
}

TEST_CONSTEXPR(match_v32qi(_mm256_maskz_shuffle_epi8(0x0000FFFF, (__m256i)(__v32qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31}, (__m256i)(__v32qi){31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0}), 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0));

__m128i test_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_mask_subs_epi8
// CHECK: @llvm.ssub.sat.v16i8
Expand Down
2 changes: 2 additions & 0 deletions clang/test/CodeGen/X86/mmx-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,8 @@ __m64 test_mm_shuffle_pi8(__m64 a, __m64 b) {
return _mm_shuffle_pi8(a, b);
}

TEST_CONSTEXPR(match_v8qi(_mm_shuffle_pi8((__m64)(__v8qi){0,1,2,3,4,5,6,7}, (__m64)(__v8qi){10,20,30,40,50,60,70,80}), 2,4,6,0,2,4,6,0));

__m64 test_mm_shuffle_pi16(__m64 a) {
// CHECK-LABEL: test_mm_shuffle_pi16
// CHECK: shufflevector <4 x i16> {{%.*}}, <4 x i16> {{%.*}}, <4 x i32> <i32 3, i32 0, i32 0, i32 0>
Expand Down
2 changes: 2 additions & 0 deletions clang/test/CodeGen/X86/ssse3-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,8 @@ __m128i test_mm_shuffle_epi8(__m128i a, __m128i b) {
return _mm_shuffle_epi8(a, b);
}

TEST_CONSTEXPR(match_v16qi(_mm_shuffle_epi8((__m128i)(__v16qi){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,char(-15)}, (__m128i)(__v16qi){15,char(-14),13,12,11,10,9,8,7,6,5,4,3,2,1,0}), -15,0,13,12,11,10,9,8,7,6,5,4,3,2,1,0));

__m128i test_mm_sign_epi8(__m128i a, __m128i b) {
// CHECK-LABEL: test_mm_sign_epi8
// CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
Expand Down