[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - allow insertps intrinsic to be used in constexpr #165513
Conversation
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-x86

Author: Ahmed Nour (ahmednoursphinx)

Changes: This PR resolves #165161.

Full diff: https://github.com/llvm/llvm-project/pull/165513.diff

4 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 0c85e280e748b..a431fc36b41c1 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -327,8 +327,11 @@ let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorW
}
}
-let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
+}
+
+let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 8f23001ea5a39..b1f0832860476 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3358,7 +3358,8 @@ static bool interp__builtin_x86_byteshift(
static bool interp__builtin_ia32_shuffle_generic(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
- GetSourceIndex) {
+ GetSourceIndex,
+ llvm::function_ref<bool(unsigned, unsigned)> ShouldZero = nullptr) {
assert(Call->getNumArgs() == 3);
unsigned ShuffleMask = popToAPSInt(S, Call->getArg(2)).getZExtValue();
@@ -3373,9 +3374,20 @@ static bool interp__builtin_ia32_shuffle_generic(
const Pointer &Dst = S.Stk.peek<Pointer>();
for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
- auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
- const Pointer &Src = (SrcVecIdx == 0) ? A : B;
- TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });
+ if (ShouldZero && ShouldZero(DstIdx, ShuffleMask)) {
+ // Zero out this element
+ if (ElemT == PT_Float) {
+ Dst.elem<Floating>(DstIdx) = Floating(S.getASTContext().getFloatTypeSemantics(VecT->getElementType()));
+ } else {
+ INT_TYPE_SWITCH_NO_BOOL(ElemT, {
+ Dst.elem<T>(DstIdx) = T::from(0);
+ });
+ }
+ } else {
+ auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
+ const Pointer &Src = (SrcVecIdx == 0) ? A : B;
+ TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });
+ }
}
Dst.initializeAllElements();
@@ -4348,6 +4360,26 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
});
+ case X86::BI__builtin_ia32_insertps128:
+ return interp__builtin_ia32_shuffle_generic(
+ S, OpPC, Call,
+ [](unsigned DstIdx, unsigned Mask) {
+ // Bits [7:6]: select element from source vector Y (0-3)
+ // Bits [5:4]: select destination position (0-3)
+ unsigned SrcElem = (Mask >> 6) & 0x3;
+ unsigned DstElem = (Mask >> 4) & 0x3;
+ if (DstIdx == DstElem) {
+ // Insert element from source vector (B) at this position
+ return std::pair<unsigned, unsigned>{1, SrcElem};
+ } else {
+ // Copy from destination vector (A)
+ return std::pair<unsigned, unsigned>{0, DstIdx};
+ }
+ },
+ [](unsigned DstIdx, unsigned Mask) {
+ // Bits [3:0]: zero mask
+ return (Mask & (1 << DstIdx)) != 0;
+ });
case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
case X86::BI__builtin_ia32_pshufb512:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 29ee089505125..17c966b8c9f4c 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11622,7 +11622,8 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
static bool evalShuffleGeneric(
EvalInfo &Info, const CallExpr *Call, APValue &Out,
llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
- GetSourceIndex) {
+ GetSourceIndex,
+ llvm::function_ref<bool(unsigned, unsigned)> ShouldZero = nullptr) {
const auto *VT = Call->getType()->getAs<VectorType>();
if (!VT)
@@ -11643,9 +11644,15 @@ static bool evalShuffleGeneric(
ResultElements.reserve(NumElts);
for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
- auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
- const APValue &Src = (SrcVecIdx == 0) ? A : B;
- ResultElements.push_back(Src.getVectorElt(SrcIdx));
+ if (ShouldZero && ShouldZero(DstIdx, ShuffleMask)) {
+ // Zero out this element
+ QualType ElemTy = VT->getElementType();
+ ResultElements.push_back(APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy))));
+ } else {
+ auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
+ const APValue &Src = (SrcVecIdx == 0) ? A : B;
+ ResultElements.push_back(Src.getVectorElt(SrcIdx));
+ }
}
Out = APValue(ResultElements.data(), ResultElements.size());
@@ -12481,6 +12488,30 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return false;
return Success(R, E);
}
+ case X86::BI__builtin_ia32_insertps128: {
+ APValue R;
+ if (!evalShuffleGeneric(
+ Info, E, R,
+ [](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, unsigned> {
+ // Bits [7:6]: select element from source vector Y (0-3)
+ // Bits [5:4]: select destination position (0-3)
+ unsigned SrcElem = (Mask >> 6) & 0x3;
+ unsigned DstElem = (Mask >> 4) & 0x3;
+ if (DstIdx == DstElem) {
+ // Insert element from source vector (B) at this position
+ return {1, SrcElem};
+ } else {
+ // Copy from destination vector (A)
+ return {0, DstIdx};
+ }
+ },
+ [](unsigned DstIdx, unsigned Mask) -> bool {
+ // Bits [3:0]: zero mask
+ return (Mask & (1 << DstIdx)) != 0;
+ }))
+ return false;
+ return Success(R, E);
+ }
case X86::BI__builtin_ia32_pshufb128:
case X86::BI__builtin_ia32_pshufb256:
case X86::BI__builtin_ia32_pshufb512: {
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index 62cd392824bb2..35fa65a99836b 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -307,6 +307,16 @@ __m128 test_mm_insert_ps(__m128 x, __m128 y) {
return _mm_insert_ps(x, y, 4);
}
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x10), 1.0f, 10.0f, 3.0f, 4.0f))); // Insert Y[0] into X[1]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x00), 10.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x20), 1.0f, 2.0f, 10.0f, 4.0f))); // Insert Y[0] into X[2]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x30), 1.0f, 2.0f, 3.0f, 10.0f))); // Insert Y[0] into X[3]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x80), 30.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[2] into X[0]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x01), 0.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0], zero X[0]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0A), 10.0f, 0.0f, 3.0f, 0.0f))); // Insert Y[0] into X[0], zero X[1] and X[3]
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0F), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[0] into X[0], zero all
+TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0xCF), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[3] into X[0], zero all
+
__m128i test_mm_max_epi8(__m128i x, __m128i y) {
// CHECK-LABEL: test_mm_max_epi8
// CHECK: call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
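As a hedged aside on what the patch enables (not part of the diff): with insertps128 marked Constexpr and both evaluators handling it, a C++ use like the following should now fold at compile time in an SSE4.1-enabled build. The function name below is illustrative only.

#include <smmintrin.h>

// Hypothetical usage sketch: _mm_insert_ps participating in constant evaluation.
constexpr __m128 insert_y0_into_lane1(void) {
  constexpr __m128 x = {1.0f, 2.0f, 3.0f, 4.0f};
  constexpr __m128 y = {10.0f, 20.0f, 30.0f, 40.0f};
  return _mm_insert_ps(x, y, 0x10); // expect {1.0f, 10.0f, 3.0f, 4.0f}, as in the 0x10 test above
}

For readers decoding the new TEST_CONSTEXPR expectations: the INSERTPS immediate takes the source element of the second operand from bits [7:6], the destination lane from bits [5:4], and a zero mask from bits [3:0], applied after the insert. The standalone sketch below (a hypothetical helper, not from the patch) models that decoding; for immediate 0x0A it writes Y[0] into lane 0 and then zeroes lanes 1 and 3, matching the 10.0f, 0.0f, 3.0f, 0.0f case above.

#include <array>

// Illustrative scalar model of the INSERTPS immediate; the real constant
// evaluation lives in InterpBuiltin.cpp / ExprConstant.cpp above.
constexpr std::array<float, 4> insertps_ref(std::array<float, 4> x,
                                            std::array<float, 4> y,
                                            unsigned imm) {
  unsigned SrcElem = (imm >> 6) & 0x3; // bits [7:6]: element taken from y
  unsigned DstElem = (imm >> 4) & 0x3; // bits [5:4]: lane of x that is replaced
  std::array<float, 4> dst = x;
  dst[DstElem] = y[SrcElem];
  for (unsigned i = 0; i != 4; ++i)    // bits [3:0]: zero mask, applied last
    if (imm & (1u << i))
      dst[i] = 0.0f;
  return dst;
}

// Mirrors the 0x0A test case: Y[0] into lane 0, lanes 1 and 3 zeroed.
constexpr auto R = insertps_ref({1.0f, 2.0f, 3.0f, 4.0f},
                                {10.0f, 20.0f, 30.0f, 40.0f}, 0x0A);
static_assert(R[0] == 10.0f && R[1] == 0.0f && R[2] == 3.0f && R[3] == 0.0f, "");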
✅ With the latest revision this PR passed the C/C++ code formatter.
      llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
-         GetSourceIndex) {
+         GetSourceIndex,
+     llvm::function_ref<bool(unsigned, unsigned)> ShouldZero = nullptr) {
We could just adjust the GetSourceIndex signature to either be std::tuple<unsigned, unsigned, bool> or std::pair<unsigned, int> and treat negative cases as zero.
There will be other shuffles (PSHUFB in particular) that will want zero handling as well.
CC @chaitanyav
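A rough standalone sketch of that suggestion, with simplified stand-in types rather than the actual InterpState/EvalInfo plumbing: a single GetSourceIndex callback returning std::pair<unsigned, int>, where a negative element index means "write zero", covers both the copy and the zeroing paths without a separate ShouldZero predicate.

#include <array>
#include <functional>
#include <utility>

using Vec4 = std::array<float, 4>;

// Illustrative only: one callback drives both element selection and zeroing.
static Vec4 shuffle_generic(const Vec4 &A, const Vec4 &B, unsigned Mask,
                            const std::function<std::pair<unsigned, int>(
                                unsigned, unsigned)> &GetSourceIndex) {
  Vec4 Dst{};
  for (unsigned DstIdx = 0; DstIdx != 4; ++DstIdx) {
    auto [SrcVec, SrcIdx] = GetSourceIndex(DstIdx, Mask);
    if (SrcIdx < 0)
      Dst[DstIdx] = 0.0f;                          // negative index => zeroed lane
    else
      Dst[DstIdx] = (SrcVec == 0 ? A : B)[SrcIdx]; // copied lane
  }
  return Dst;
}

// INSERTPS-style callback under that convention.
static Vec4 insertps_like(const Vec4 &X, const Vec4 &Y, unsigned Imm) {
  return shuffle_generic(X, Y, Imm, [](unsigned DstIdx, unsigned Mask) {
    if (Mask & (1u << DstIdx))                     // bits [3:0]: zero mask
      return std::pair<unsigned, int>{0, -1};
    unsigned SrcElem = (Mask >> 6) & 0x3;          // bits [7:6]: element of Y
    unsigned DstElem = (Mask >> 4) & 0x3;          // bits [5:4]: target lane
    if (DstIdx == DstElem)
      return std::pair<unsigned, int>{1, (int)SrcElem};
    return std::pair<unsigned, int>{0, (int)DstIdx};
  });
}

The std::tuple<unsigned, unsigned, bool> variant mentioned above would read the same way, just with an explicit zero flag instead of the sign convention.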
RKSimon left a comment:
Thanks - the single callback approach looks better to me
clang/lib/AST/ExprConstant.cpp (outdated)
      EvalInfo &Info, const CallExpr *Call, APValue &Out,
      llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
-         GetSourceIndex) {
+         GetSourceIndex,
Remove ShouldZero and update this callback to match interp__builtin_ia32_shuffle_generic
Hey @RKSimon, thanks for the feedback.
RKSimon left a comment:
LGTM - cheers