Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,6 @@ let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorW
}

let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
Expand All @@ -342,6 +341,8 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]

let Features = "sse4.1",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, "
"_Vector<4, float>, _Constant char)">;
def ptestz128
: X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">;
def ptestc128
Expand Down
42 changes: 37 additions & 5 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3411,7 +3411,7 @@ static bool interp__builtin_x86_byteshift(

static bool interp__builtin_ia32_shuffle_generic(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned)>
GetSourceIndex) {

assert(Call->getNumArgs() == 3);
Expand All @@ -3428,8 +3428,19 @@ static bool interp__builtin_ia32_shuffle_generic(

for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
const Pointer &Src = (SrcVecIdx == 0) ? A : B;
TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });

if (SrcIdx < 0) {
// Zero out this element
if (ElemT == PT_Float) {
Dst.elem<Floating>(DstIdx) = Floating(
S.getASTContext().getFloatTypeSemantics(VecT->getElementType()));
} else {
INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem<T>(DstIdx) = T::from(0); });
}
} else {
const Pointer &Src = (SrcVecIdx == 0) ? A : B;
TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });
}
}
Dst.initializeAllElements();

Expand Down Expand Up @@ -4382,7 +4393,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0;
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
return std::pair<unsigned, int>{SrcIdx,
static_cast<int>(LaneOffset + Index)};
});
case X86::BI__builtin_ia32_shufpd:
case X86::BI__builtin_ia32_shufpd256:
Expand All @@ -4400,7 +4412,27 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0;
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
return std::pair<unsigned, int>{SrcIdx,
static_cast<int>(LaneOffset + Index)};
});
// insertps: build the result element by element from the 8-bit immediate.
// The callback returns {source vector selector, element index}; a negative
// element index tells interp__builtin_ia32_shuffle_generic to write zero
// instead of copying from either source.
case X86::BI__builtin_ia32_insertps128:
return interp__builtin_ia32_shuffle_generic(
S, OpPC, Call, [](unsigned DstIdx, unsigned Mask) {
// Bits [3:0]: zero mask - if bit is set, zero this element.
// Checked first, so a set zero bit wins even for the lane that would
// otherwise receive the inserted element.
if ((Mask & (1 << DstIdx)) != 0) {
return std::pair<unsigned, int>{0, -1};
}
// Bits [7:6]: select element from source vector Y (0-3)
// Bits [5:4]: select destination position (0-3)
unsigned SrcElem = (Mask >> 6) & 0x3;
unsigned DstElem = (Mask >> 4) & 0x3;
if (DstIdx == DstElem) {
// Insert element from source vector (B) at this position
return std::pair<unsigned, int>{1, static_cast<int>(SrcElem)};
} else {
// Copy from destination vector (A); every lane other than DstElem
// keeps its own element.
return std::pair<unsigned, int>{0, static_cast<int>(DstIdx)};
}
});
case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
Expand Down
41 changes: 37 additions & 4 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11622,7 +11622,8 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
static bool evalShuffleGeneric(
EvalInfo &Info, const CallExpr *Call, APValue &Out,
llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
GetSourceIndex) {
GetSourceIndex,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove ShouldZero and update this callback to match interp__builtin_ia32_shuffle_generic

llvm::function_ref<bool(unsigned, unsigned)> ShouldZero = nullptr) {

const auto *VT = Call->getType()->getAs<VectorType>();
if (!VT)
Expand All @@ -11643,9 +11644,16 @@ static bool evalShuffleGeneric(
ResultElements.reserve(NumElts);

for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
const APValue &Src = (SrcVecIdx == 0) ? A : B;
ResultElements.push_back(Src.getVectorElt(SrcIdx));
if (ShouldZero && ShouldZero(DstIdx, ShuffleMask)) {
// Zero out this element
QualType ElemTy = VT->getElementType();
ResultElements.push_back(
APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy))));
} else {
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
const APValue &Src = (SrcVecIdx == 0) ? A : B;
ResultElements.push_back(Src.getVectorElt(SrcIdx));
}
}

Out = APValue(ResultElements.data(), ResultElements.size());
Expand Down Expand Up @@ -12481,6 +12489,31 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return false;
return Success(R, E);
}
// insertps: constant-evaluate via evalShuffleGeneric using two callbacks
// driven by the 8-bit immediate. The first maps each destination lane to
// {source vector selector, element index}; the second reports which lanes
// must be zeroed. evalShuffleGeneric consults the zero predicate before the
// source callback, so a set zero bit overrides the insertion for that lane.
case X86::BI__builtin_ia32_insertps128: {
APValue R;
if (!evalShuffleGeneric(
Info, E, R,
[](unsigned DstIdx,
unsigned Mask) -> std::pair<unsigned, unsigned> {
// Bits [7:6]: select element from source vector Y (0-3)
// Bits [5:4]: select destination position (0-3)
unsigned SrcElem = (Mask >> 6) & 0x3;
unsigned DstElem = (Mask >> 4) & 0x3;
if (DstIdx == DstElem) {
// Insert element from source vector (B) at this position
return {1, SrcElem};
} else {
// Copy from destination vector (A); lanes other than DstElem keep
// their own element.
return {0, DstIdx};
}
},
[](unsigned DstIdx, unsigned Mask) -> bool {
// Bits [3:0]: zero mask - lane DstIdx is zeroed when its bit is set.
// NOTE(review): the bytecode interpreter's insertps handler encodes
// the same condition as a negative element index rather than a
// separate predicate; keep the two implementations in sync.
return (Mask & (1 << DstIdx)) != 0;
}))
return false;
return Success(R, E);
}
case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
case X86::BI__builtin_ia32_pshufb512: {
Expand Down
10 changes: 10 additions & 0 deletions clang/test/CodeGen/X86/sse41-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,16 @@ __m128 test_mm_insert_ps(__m128 x, __m128 y) {
return _mm_insert_ps(x, y, 4);
}

TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x10), 1.0f, 10.0f, 3.0f, 4.0f))); // Insert Y[0] into X[1]
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x00), 10.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0]
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x20), 1.0f, 2.0f, 10.0f, 4.0f))); // Insert Y[0] into X[2]
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x30), 1.0f, 2.0f, 3.0f, 10.0f))); // Insert Y[0] into X[3]
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x80), 30.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[2] into X[0]
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x01), 0.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0], zero X[0]
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0A), 10.0f, 0.0f, 3.0f, 0.0f))); // Insert Y[0] into X[0], zero X[1] and X[3]
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0F), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[0] into X[0], zero all
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0xCF), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[3] into X[0], zero all

__m128i test_mm_max_epi8(__m128i x, __m128i y) {
// CHECK-LABEL: test_mm_max_epi8
// CHECK: call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
Expand Down