Skip to content
Merged
8 changes: 4 additions & 4 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ let Features = "sse3", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def lddqu : X86Builtin<"_Vector<16, char>(char const *)">;
}

let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def palignr128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Constant int)">;
}

Expand Down Expand Up @@ -609,8 +609,7 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid

let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, "
"_Vector<32, char>, _Constant int)">;

def psadbw256
: X86Builtin<
"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
Expand All @@ -634,6 +633,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
def pavgb256 : X86Builtin<"_Vector<32, unsigned char>(_Vector<32, unsigned char>, _Vector<32, unsigned char>)">;
def pavgw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;

def pblendd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
def pblendd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
Expand Down Expand Up @@ -3267,7 +3267,7 @@ let Features = "avx512bw", Attributes = [NoThrow, Const] in {
def kmovq : X86Builtin<"unsigned long long int(unsigned long long int)">;
}

let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def palignr512 : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Vector<64, char>, _Constant int)">;
}

Expand Down
50 changes: 39 additions & 11 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3411,7 +3411,7 @@ static bool interp__builtin_x86_byteshift(

static bool interp__builtin_ia32_shuffle_generic(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned)>
llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned, unsigned)>
GetSourceIndex) {

assert(Call->getNumArgs() == 3);
Expand Down Expand Up @@ -3455,7 +3455,7 @@ static bool interp__builtin_ia32_shuffle_generic(
ShuffleMask = static_cast<unsigned>(MaskVector.elem<T>(DstIdx));
});
}
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask, NumElems);

if (SrcIdx < 0) {
// Zero out this element
Expand Down Expand Up @@ -4409,7 +4409,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_shufps256:
case X86::BI__builtin_ia32_shufps512:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
S, OpPC, Call,
[](unsigned DstIdx, unsigned ShuffleMask, unsigned NumElems) {
unsigned NumElemPerLane = 4;
unsigned NumSelectableElems = NumElemPerLane / 2;
unsigned BitsPerElem = 2;
Expand All @@ -4428,7 +4429,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_shufpd256:
case X86::BI__builtin_ia32_shufpd512:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
S, OpPC, Call,
[](unsigned DstIdx, unsigned ShuffleMask, unsigned NumElems) {
unsigned NumElemPerLane = 2;
unsigned NumSelectableElems = NumElemPerLane / 2;
unsigned BitsPerElem = 1;
Expand All @@ -4445,7 +4447,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
});
case X86::BI__builtin_ia32_insertps128:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned Mask) {
S, OpPC, Call, [](unsigned DstIdx, unsigned Mask, unsigned NumElems) {
// Bits [3:0]: zero mask - if bit is set, zero this element
if ((Mask & (1 << DstIdx)) != 0) {
return std::pair<unsigned, int>{0, -1};
Expand All @@ -4465,7 +4467,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_vpermi2varq128:
case X86::BI__builtin_ia32_vpermi2varpd128:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
S, OpPC, Call,
[](unsigned DstIdx, unsigned ShuffleMask, unsigned NumElems) {
int Offset = ShuffleMask & 0x1;
unsigned SrcIdx = (ShuffleMask >> 1) & 0x1;
return std::pair<unsigned, int>{SrcIdx, Offset};
Expand All @@ -4475,7 +4478,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_vpermi2varq256:
case X86::BI__builtin_ia32_vpermi2varpd256:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
S, OpPC, Call,
[](unsigned DstIdx, unsigned ShuffleMask, unsigned NumElems) {
int Offset = ShuffleMask & 0x3;
unsigned SrcIdx = (ShuffleMask >> 2) & 0x1;
return std::pair<unsigned, int>{SrcIdx, Offset};
Expand All @@ -4486,7 +4490,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_vpermi2varq512:
case X86::BI__builtin_ia32_vpermi2varpd512:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
S, OpPC, Call,
[](unsigned DstIdx, unsigned ShuffleMask, unsigned NumElems) {
int Offset = ShuffleMask & 0x7;
unsigned SrcIdx = (ShuffleMask >> 3) & 0x1;
return std::pair<unsigned, int>{SrcIdx, Offset};
Expand All @@ -4496,22 +4501,25 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_vpermi2vard512:
case X86::BI__builtin_ia32_vpermi2varps512:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
S, OpPC, Call,
[](unsigned DstIdx, unsigned ShuffleMask, unsigned NumElems) {
int Offset = ShuffleMask & 0xF;
unsigned SrcIdx = (ShuffleMask >> 4) & 0x1;
return std::pair<unsigned, int>{SrcIdx, Offset};
});
case X86::BI__builtin_ia32_vpermi2varqi256:
case X86::BI__builtin_ia32_vpermi2varhi512:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
S, OpPC, Call,
[](unsigned DstIdx, unsigned ShuffleMask, unsigned NumElems) {
int Offset = ShuffleMask & 0x1F;
unsigned SrcIdx = (ShuffleMask >> 5) & 0x1;
return std::pair<unsigned, int>{SrcIdx, Offset};
});
case X86::BI__builtin_ia32_vpermi2varqi512:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) {
S, OpPC, Call,
[](unsigned DstIdx, unsigned ShuffleMask, unsigned NumElems) {
int Offset = ShuffleMask & 0x3F;
unsigned SrcIdx = (ShuffleMask >> 6) & 0x1;
return std::pair<unsigned, int>{SrcIdx, Offset};
Expand Down Expand Up @@ -4718,6 +4726,26 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return APInt(8, 0);
});

case X86::BI__builtin_ia32_palignr128:
case X86::BI__builtin_ia32_palignr256:
case X86::BI__builtin_ia32_palignr512:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned Shift, unsigned NumElems) {
// Default to -1 → zero-fill this destination element
unsigned VecIdx = 0;
int ElemIdx = -1;

// Elements come from VecB first, then VecA after the shift boundary
unsigned ShiftedIdx = DstIdx + Shift;
if (ShiftedIdx < NumElems) { // from VecB
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

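These should be NumElemsPerLane rather than NumElems: palignr concatenates and shifts within each 128-bit lane independently, so for the 256/512-bit variants the source index must be computed per lane. Given that the palignr builtins are hardcoded to <X x i8> types, I think you can just assume NumElemsPerLane == 16 (and remove the NumElems arg again from interp__builtin_ia32_shuffle_generic - sorry about that!).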

VecIdx = 1;
ElemIdx = DstIdx + Shift;
} else if (ShiftedIdx < 2 * NumElems) { // from VecA
ElemIdx = DstIdx + Shift - NumElems;
}
return std::pair<unsigned, int>{VecIdx, ElemIdx};
});

default:
S.FFDiag(S.Current->getLocation(OpPC),
diag::note_invalid_subexpr_in_const_expr)
Expand Down
Loading