Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,6 @@ let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorW
}

let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
Expand All @@ -342,6 +341,8 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]

let Features = "sse4.1",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, "
"_Vector<4, float>, _Constant char)">;
def ptestz128
: X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">;
def ptestc128
Expand Down
42 changes: 37 additions & 5 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3411,7 +3411,7 @@ static bool interp__builtin_x86_byteshift(

static bool interp__builtin_ia32_shuffle_generic(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned)>
GetSourceIndex) {

assert(Call->getNumArgs() == 3);
Expand All @@ -3428,8 +3428,19 @@ static bool interp__builtin_ia32_shuffle_generic(

for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
const Pointer &Src = (SrcVecIdx == 0) ? A : B;
TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });

if (SrcIdx < 0) {
// Zero out this element
if (ElemT == PT_Float) {
Dst.elem<Floating>(DstIdx) = Floating(
S.getASTContext().getFloatTypeSemantics(VecT->getElementType()));
} else {
INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem<T>(DstIdx) = T::from(0); });
}
} else {
const Pointer &Src = (SrcVecIdx == 0) ? A : B;
TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });
}
}
Dst.initializeAllElements();

Expand Down Expand Up @@ -4382,7 +4393,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0;
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
return std::pair<unsigned, int>{SrcIdx,
static_cast<int>(LaneOffset + Index)};
});
case X86::BI__builtin_ia32_shufpd:
case X86::BI__builtin_ia32_shufpd256:
Expand All @@ -4400,7 +4412,27 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0;
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
return std::pair<unsigned, int>{SrcIdx,
static_cast<int>(LaneOffset + Index)};
});
case X86::BI__builtin_ia32_insertps128:
  // insertps: the immediate mask is decoded per destination lane as
  //   bits [7:6] - element of the second vector (B) to insert,
  //   bits [5:4] - destination lane that receives that element,
  //   bits [3:0] - zero mask; a set bit clears the matching result lane.
  // A negative source index is returned for lanes that must be zeroed.
  return interp__builtin_ia32_shuffle_generic(
      S, OpPC, Call,
      [](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, int> {
        // The zero mask is checked first, so it wins over the insertion.
        const bool ZeroLane = ((Mask >> DstIdx) & 1u) != 0;
        if (ZeroLane)
          return {0, -1};

        // Every lane other than the insertion target keeps A's element.
        const unsigned InsertLane = (Mask >> 4) & 0x3;
        if (DstIdx != InsertLane)
          return {0, static_cast<int>(DstIdx)};

        // The insertion target takes the selected element of B.
        const unsigned SourceLane = (Mask >> 6) & 0x3;
        return {1, static_cast<int>(SourceLane)};
      });
case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
Expand Down
46 changes: 39 additions & 7 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11621,7 +11621,7 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,

static bool evalShuffleGeneric(
EvalInfo &Info, const CallExpr *Call, APValue &Out,
llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned)>
GetSourceIndex) {

const auto *VT = Call->getType()->getAs<VectorType>();
Expand All @@ -11644,8 +11644,16 @@ static bool evalShuffleGeneric(

for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
const APValue &Src = (SrcVecIdx == 0) ? A : B;
ResultElements.push_back(Src.getVectorElt(SrcIdx));

if (SrcIdx < 0) {
// Zero out this element
QualType ElemTy = VT->getElementType();
ResultElements.push_back(
APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy))));
} else {
const APValue &Src = (SrcVecIdx == 0) ? A : B;
ResultElements.push_back(Src.getVectorElt(SrcIdx));
}
}

Out = APValue(ResultElements.data(), ResultElements.size());
Expand Down Expand Up @@ -12438,7 +12446,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
if (!evalShuffleGeneric(
Info, E, R,
[](unsigned DstIdx,
unsigned ShuffleMask) -> std::pair<unsigned, unsigned> {
unsigned ShuffleMask) -> std::pair<unsigned, int> {
constexpr unsigned LaneBits = 128u;
unsigned NumElemPerLane = LaneBits / 32;
unsigned NumSelectableElems = NumElemPerLane / 2;
Expand All @@ -12451,7 +12459,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 0 : 1;
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
return {SrcIdx, LaneOffset + Index};
return {SrcIdx, static_cast<int>(LaneOffset + Index)};
}))
return false;
return Success(R, E);
Expand All @@ -12463,7 +12471,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
if (!evalShuffleGeneric(
Info, E, R,
[](unsigned DstIdx,
unsigned ShuffleMask) -> std::pair<unsigned, unsigned> {
unsigned ShuffleMask) -> std::pair<unsigned, int> {
constexpr unsigned LaneBits = 128u;
unsigned NumElemPerLane = LaneBits / 64;
unsigned NumSelectableElems = NumElemPerLane / 2;
Expand All @@ -12476,7 +12484,31 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 0 : 1;
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
return {SrcIdx, LaneOffset + Index};
return {SrcIdx, static_cast<int>(LaneOffset + Index)};
}))
return false;
return Success(R, E);
}
case X86::BI__builtin_ia32_insertps128: {
APValue R;
if (!evalShuffleGeneric(
Info, E, R,
[](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, int> {
// Bits [3:0]: zero mask - if bit is set, zero this element
if ((Mask & (1 << DstIdx)) != 0) {
return {0, -1};
}
// Bits [7:6]: select element from source vector Y (0-3)
// Bits [5:4]: select destination position (0-3)
unsigned SrcElem = (Mask >> 6) & 0x3;
unsigned DstElem = (Mask >> 4) & 0x3;
if (DstIdx == DstElem) {
// Insert element from source vector (B) at this position
return {1, static_cast<int>(SrcElem)};
} else {
// Copy from destination vector (A)
return {0, static_cast<int>(DstIdx)};
}
}))
return false;
return Success(R, E);
Expand Down
10 changes: 10 additions & 0 deletions clang/test/CodeGen/X86/sse41-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,16 @@ __m128 test_mm_insert_ps(__m128 x, __m128 y) {
return _mm_insert_ps(x, y, 4);
}

// Constant-evaluation checks for _mm_insert_ps(X, Y, imm) with
// X = {1, 2, 3, 4} and Y = {10, 20, 30, 40}. The immediate is decoded as:
//   imm[7:6] selects the element of Y to insert,
//   imm[5:4] selects the lane of X that receives it,
//   imm[3:0] is a zero mask; each set bit forces that result lane to 0.0f.
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x10), 1.0f, 10.0f, 3.0f, 4.0f))); // Insert Y[0] into X[1]
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x00), 10.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0]
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x20), 1.0f, 2.0f, 10.0f, 4.0f))); // Insert Y[0] into X[2]
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x30), 1.0f, 2.0f, 3.0f, 10.0f))); // Insert Y[0] into X[3]
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x80), 30.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[2] into X[0]
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x01), 0.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0], but zero-mask bit 0 overrides it: X[0] becomes 0
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0A), 10.0f, 0.0f, 3.0f, 0.0f))); // Insert Y[0] into X[0], zero X[1] and X[3]
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0F), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[0] into X[0], then zero all four lanes
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0xCF), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[3] into X[0], then zero all four lanes

__m128i test_mm_max_epi8(__m128i x, __m128i y) {
// CHECK-LABEL: test_mm_max_epi8
// CHECK: call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
Expand Down
Loading