From 019b3ba36b8205b57135ab22b4603fee7d453945 Mon Sep 17 00:00:00 2001 From: Brandon Xin Date: Tue, 9 Sep 2025 19:26:11 -0500 Subject: [PATCH 1/7] [X86][bytecode] Allow SSE/AVX BLEND imm intrinsics to be used in constexpr --- clang/include/clang/Basic/BuiltinsX86.td | 17 +++++---- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 48 ++++++++++++++++++++++++ clang/lib/AST/ExprConstant.cpp | 34 +++++++++++++++++ clang/test/CodeGen/X86/avx-builtins.c | 8 ++++ clang/test/CodeGen/X86/avx2-builtins.c | 12 ++++++ clang/test/CodeGen/X86/sse41-builtins.c | 13 ++++++- 6 files changed, 123 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 358dfdb3ea421..6accb218231cb 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -312,9 +312,6 @@ let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">; - def pblendw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">; - def blendpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">; - def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">; def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">; def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">; def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">; @@ -333,6 +330,9 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] } let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def pblendw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">; + def blendpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">; + def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">; def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">; def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">; def pblendvb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">; @@ -469,8 +469,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in def vpermilvarps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>)">; def vpermilvarpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>)">; def vpermilvarps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">; - def blendpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">; - def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">; def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">; def shufps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">; def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">; @@ -495,6 +493,8 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in } 
 let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
+  def blendpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
+  def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
   def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
   def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
 }
@@ -575,7 +575,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
   def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
   def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
   def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
-  def pblendw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">;
   def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
   def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
   def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
@@ -604,8 +603,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
   def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
   def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
   def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
-  def pblendd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
-  def pblendd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
   def permvarsi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
   def permdf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
   def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
@@ -619,6 +616,10 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
   def pavgb256 : X86Builtin<"_Vector<32, unsigned char>(_Vector<32, unsigned char>, _Vector<32, unsigned char>)">;
   def pavgw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
 
+  def pblendd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
+  def pblendd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
+  def pblendw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">;
+
   def pblendvb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
   def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index d418e0ac5d094..fef9373191451 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2831,6 +2831,44 @@ static bool interp__builtin_select(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
+                                  const CallExpr *Call, unsigned BuiltinID) {
+  PrimType MaskT = *S.getContext().classify(Call->getArg(2));
+  APSInt Mask = popToAPSInt(S.Stk, MaskT);
+  const Pointer &TrueElem = S.Stk.pop<Pointer>();
+  const Pointer &FalseElem = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  assert(FalseElem.getNumElems() == TrueElem.getNumElems());
+  assert(FalseElem.getNumElems() == Dst.getNumElems());
+  unsigned NumElems = FalseElem.getNumElems();
+  PrimType ElemT = FalseElem.getFieldDesc()->getPrimType();
+  PrimType DstElemT = Dst.getFieldDesc()->getPrimType();
+
+  auto BitIndex = BuiltinID == X86::BI__builtin_ia32_pblendw256
+                      ? [](unsigned I) { return I % 8; }
+                      : [](unsigned I) { return I; };
+  for (unsigned I = 0; I != NumElems; ++I) {
+    bool MaskBit = Mask[BitIndex(I)];
+    if (ElemT == PT_Float) {
+      assert(DstElemT == PT_Float);
+      Dst.elem<Floating>(I) =
+          MaskBit ? TrueElem.elem<Floating>(I) : FalseElem.elem<Floating>(I);
+    } else {
+      APSInt Elem;
+      INT_TYPE_SWITCH(ElemT, {
+        Elem = MaskBit ? TrueElem.elem<T>(I).toAPSInt()
+                       : FalseElem.elem<T>(I).toAPSInt();
+      });
+      INT_TYPE_SWITCH_NO_BOOL(DstElemT,
+                              { Dst.elem<T>(I) = static_cast<T>(Elem); });
+    }
+  }
+  Dst.initializeAllElements();
+
+  return true;
+}
+
 static bool interp__builtin_elementwise_triop(
     InterpState &S, CodePtr OpPC, const CallExpr *Call,
     llvm::function_ref<APInt(const APSInt &, const APSInt &, const APSInt &)>
@@ -3496,6 +3534,16 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
       return llvm::APIntOps::fshr(Hi, Lo, Amt);
     });
 
+  case clang::X86::BI__builtin_ia32_blendpd:
+  case clang::X86::BI__builtin_ia32_blendpd256:
+  case clang::X86::BI__builtin_ia32_blendps:
+  case clang::X86::BI__builtin_ia32_blendps256:
+  case clang::X86::BI__builtin_ia32_pblendw128:
+  case clang::X86::BI__builtin_ia32_pblendw256:
+  case clang::X86::BI__builtin_ia32_pblendd128:
+  case clang::X86::BI__builtin_ia32_pblendd256:
+    return interp__builtin_blend(S, OpPC, Call, BuiltinID);
+
   case clang::X86::BI__builtin_ia32_blendvpd:
   case clang::X86::BI__builtin_ia32_blendvpd256:
   case clang::X86::BI__builtin_ia32_blendvps:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 2376e482a19f5..63884ac815a56 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11945,6 +11945,40 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }
 
+  case X86::BI__builtin_ia32_blendpd:
+  case X86::BI__builtin_ia32_blendpd256:
+  case X86::BI__builtin_ia32_blendps:
+  case X86::BI__builtin_ia32_blendps256:
+  case X86::BI__builtin_ia32_pblendw128:
+  case X86::BI__builtin_ia32_pblendw256:
+  case X86::BI__builtin_ia32_pblendd128:
+  case X86::BI__builtin_ia32_pblendd256: {
+    APValue SourceF, SourceT, SourceC;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SourceF) ||
+        !EvaluateAsRValue(Info, E->getArg(1), SourceT) ||
+        !EvaluateAsRValue(Info, E->getArg(2), SourceC))
+      return false;
+
+    assert(SourceF.getKind() == clang::APValue::Vector);
+    assert(SourceT.getKind() == clang::APValue::Vector);
+    assert(SourceC.getKind() == clang::APValue::Int);
+
+    const APInt &C = SourceC.getInt();
+    auto SourceLen = SourceF.getVectorLength();
+    SmallVector<APValue, 32> ResultElements;
+    ResultElements.reserve(SourceLen);
+    auto BitIndex = E->getBuiltinCallee() == X86::BI__builtin_ia32_pblendw256
+                        ? [](unsigned I) { return I % 8; }
+                        : [](unsigned I) { return I; };
+    for (unsigned EltNum = 0; EltNum != SourceLen; ++EltNum) {
+      const APValue &F = SourceF.getVectorElt(EltNum);
+      const APValue &T = SourceT.getVectorElt(EltNum);
+      ResultElements.push_back(C[BitIndex(EltNum)] ?
T : F); + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + case X86::BI__builtin_ia32_blendvpd: case X86::BI__builtin_ia32_blendvpd256: case X86::BI__builtin_ia32_blendvps: diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index f255dbe1b2adc..d4fa43254a3a8 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -87,12 +87,20 @@ __m256d test_mm256_blend_pd(__m256d A, __m256d B) { // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> return _mm256_blend_pd(A, B, 0x05); } +TEST_CONSTEXPR(match_m256d(_mm256_blend_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x00), 1.0, 2.0, 3.0, 4.0)); +TEST_CONSTEXPR(match_m256d(_mm256_blend_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x05), 5.0, 2.0, 7.0, 4.0)); +TEST_CONSTEXPR(match_m256d(_mm256_blend_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x0A), 1.0, 6.0, 3.0, 8.0)); +TEST_CONSTEXPR(match_m256d(_mm256_blend_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x0F), 5.0, 6.0, 7.0, 8.0)); __m256 test_mm256_blend_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_blend_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> return _mm256_blend_ps(A, B, 0x35); } +TEST_CONSTEXPR(match_m256(_mm256_blend_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f}), 0x00), 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f)); +TEST_CONSTEXPR(match_m256(_mm256_blend_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f}), 0x35), -1.0f, 2.0f, -3.0f, 4.0f, -5.0f, -6.0f, 7.0f, 8.0f)); +TEST_CONSTEXPR(match_m256(_mm256_blend_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f}), 0xAA), 1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, -8.0f)); +TEST_CONSTEXPR(match_m256(_mm256_blend_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f}), 0xFF), -1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f)); __m256d test_mm256_blendv_pd(__m256d V1, __m256d V2, __m256d V3) { // CHECK-LABEL: test_mm256_blendv_pd diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index aeb1aee4ea946..17ab47c72ad4b 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -146,6 +146,10 @@ __m256i test_mm256_blend_epi16(__m256i a, __m256i b) { // CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> return _mm256_blend_epi16(a, b, 2); } +TEST_CONSTEXPR(match_v16hi(_mm256_blend_epi16(((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}), ((__m256i)(__v16hi){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}), 0x00), 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16)); +TEST_CONSTEXPR(match_v16hi(_mm256_blend_epi16(((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}), ((__m256i)(__v16hi){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}), 0x5A), 1,-2,3,-4,-5,6,-7,8,9,-10,11,-12,-13,14,-15,16)); +TEST_CONSTEXPR(match_v16hi(_mm256_blend_epi16(((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}), ((__m256i)(__v16hi){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}), 0x94), 1,2,-3,4,-5,6,7,-8,9,10,-11,12,-13,14,15,-16)); 
+TEST_CONSTEXPR(match_v16hi(_mm256_blend_epi16(((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}), ((__m256i)(__v16hi){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}), 0xFF), -1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16)); __m128i test_mm_blend_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_blend_epi32 @@ -153,6 +157,10 @@ __m128i test_mm_blend_epi32(__m128i a, __m128i b) { // CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> return _mm_blend_epi32(a, b, 0x05); } +TEST_CONSTEXPR(match_v4si(_mm_blend_epi32(((__m128i)(__v4si){1,2,3,4}), ((__m128i)(__v4si){-1,-2,-3,-4}), 0x0), 1,2,3,4)); +TEST_CONSTEXPR(match_v4si(_mm_blend_epi32(((__m128i)(__v4si){1,2,3,4}), ((__m128i)(__v4si){-1,-2,-3,-4}), 0x5), -1,2,-3,4)); +TEST_CONSTEXPR(match_v4si(_mm_blend_epi32(((__m128i)(__v4si){1,2,3,4}), ((__m128i)(__v4si){-1,-2,-3,-4}), 0xA), 1,-2,3,-4)); +TEST_CONSTEXPR(match_v4si(_mm_blend_epi32(((__m128i)(__v4si){1,2,3,4}), ((__m128i)(__v4si){-1,-2,-3,-4}), 0xF), -1,-2,-3,-4)); __m256i test_mm256_blend_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_blend_epi32 @@ -160,6 +168,10 @@ __m256i test_mm256_blend_epi32(__m256i a, __m256i b) { // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> return _mm256_blend_epi32(a, b, 0x35); } +TEST_CONSTEXPR(match_v8si(_mm256_blend_epi32(((__m256i)(__v8si){1,2,3,4,5,6,7,8}), ((__m256i)(__v8si){-1,-2,-3,-4,-5,-6,-7,-8}), 0x00), 1,2,3,4,5,6,7,8)); +TEST_CONSTEXPR(match_v8si(_mm256_blend_epi32(((__m256i)(__v8si){1,2,3,4,5,6,7,8}), ((__m256i)(__v8si){-1,-2,-3,-4,-5,-6,-7,-8}), 0xA5), -1,2,-3,4,5,-6,7,-8)); +TEST_CONSTEXPR(match_v8si(_mm256_blend_epi32(((__m256i)(__v8si){1,2,3,4,5,6,7,8}), ((__m256i)(__v8si){-1,-2,-3,-4,-5,-6,-7,-8}), 0x94), 1,2,-3,4,-5,6,7,-8)); +TEST_CONSTEXPR(match_v8si(_mm256_blend_epi32(((__m256i)(__v8si){1,2,3,4,5,6,7,8}), ((__m256i)(__v8si){-1,-2,-3,-4,-5,-6,-7,-8}), 0xFF), -1,-2,-3,-4,-5,-6,-7,-8)); __m256i test_mm256_blendv_epi8(__m256i a, __m256i b, __m256i m) { // CHECK-LABEL: test_mm256_blendv_epi8 diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c index dca161c8038a2..c7265b188d572 100644 --- a/clang/test/CodeGen/X86/sse41-builtins.c +++ b/clang/test/CodeGen/X86/sse41-builtins.c @@ -27,18 +27,30 @@ __m128i test_mm_blend_epi16(__m128i V1, __m128i V2) { // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> return _mm_blend_epi16(V1, V2, 42); } +TEST_CONSTEXPR(match_v8hi(_mm_blend_epi16(((__m128i)(__v8hi){1,2,3,4,5,6,7,8}),((__m128i)(__v8hi){-1,-2,-3,-4,-5,-6,-7,-8}),0x00),1,2,3,4,5,6,7,8)); +TEST_CONSTEXPR(match_v8hi(_mm_blend_epi16(((__m128i)(__v8hi){1,2,3,4,5,6,7,8}),((__m128i)(__v8hi){-1,-2,-3,-4,-5,-6,-7,-8}),0x5A),1,-2,3,-4,-5,6,-7,8)); +TEST_CONSTEXPR(match_v8hi(_mm_blend_epi16(((__m128i)(__v8hi){1,2,3,4,5,6,7,8}),((__m128i)(__v8hi){-1,-2,-3,-4,-5,-6,-7,-8}),0x94),1,2,-3,4,-5,6,7,-8)); +TEST_CONSTEXPR(match_v8hi(_mm_blend_epi16(((__m128i)(__v8hi){1,2,3,4,5,6,7,8}),((__m128i)(__v8hi){-1,-2,-3,-4,-5,-6,-7,-8}),0xFF),-1,-2,-3,-4,-5,-6,-7,-8)); __m128d test_mm_blend_pd(__m128d V1, __m128d V2) { // CHECK-LABEL: test_mm_blend_pd // CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> return _mm_blend_pd(V1, V2, 2); } +TEST_CONSTEXPR(match_m128d(_mm_blend_pd(((__m128d){1.0, 2.0}), ((__m128d){3.0, 4.0}), 0), 1.0, 2.0)); +TEST_CONSTEXPR(match_m128d(_mm_blend_pd(((__m128d){1.0, 2.0}), ((__m128d){3.0, 4.0}), 1), 3.0, 2.0)); +TEST_CONSTEXPR(match_m128d(_mm_blend_pd(((__m128d){1.0, 2.0}), 
((__m128d){3.0, 4.0}), 2), 1.0, 4.0));
+TEST_CONSTEXPR(match_m128d(_mm_blend_pd(((__m128d){1.0, 2.0}), ((__m128d){3.0, 4.0}), 3), 3.0, 4.0));
 
 __m128 test_mm_blend_ps(__m128 V1, __m128 V2) {
   // CHECK-LABEL: test_mm_blend_ps
   // CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32>
   return _mm_blend_ps(V1, V2, 6);
 }
+TEST_CONSTEXPR(match_m128(_mm_blend_ps(((__m128){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128){5.0f, 6.0f, 7.0f, 8.0f}), 0x0), 1.0f, 2.0f, 3.0f, 4.0f));
+TEST_CONSTEXPR(match_m128(_mm_blend_ps(((__m128){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128){5.0f, 6.0f, 7.0f, 8.0f}), 0x5), 5.0f, 2.0f, 7.0f, 4.0f));
+TEST_CONSTEXPR(match_m128(_mm_blend_ps(((__m128){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128){5.0f, 6.0f, 7.0f, 8.0f}), 0xA), 1.0f, 6.0f, 3.0f, 8.0f));
+TEST_CONSTEXPR(match_m128(_mm_blend_ps(((__m128){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128){5.0f, 6.0f, 7.0f, 8.0f}), 0xF), 5.0f, 6.0f, 7.0f, 8.0f));
 
 __m128i test_mm_blendv_epi8(__m128i V1, __m128i V2, __m128i V3) {
   // CHECK-LABEL: test_mm_blendv_epi8
@@ -459,4 +471,3 @@ int test_mm_testz_si128(__m128i x, __m128i y) {
   // CHECK: call {{.*}}i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
   return _mm_testz_si128(x, y);
 }
-

From 436f0c914d70c5f5a4f582b07abb21542f8a11d0 Mon Sep 17 00:00:00 2001
From: Brandon Xin
Date: Wed, 10 Sep 2025 20:29:22 -0500
Subject: [PATCH 2/7] Simplify blend index calculation

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 5 +----
 clang/lib/AST/ExprConstant.cpp           | 5 +----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index fef9373191451..51e94ad1a1027 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2845,11 +2845,8 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
   PrimType ElemT = FalseElem.getFieldDesc()->getPrimType();
   PrimType DstElemT = Dst.getFieldDesc()->getPrimType();
 
-  auto BitIndex = BuiltinID == X86::BI__builtin_ia32_pblendw256
-                      ? [](unsigned I) { return I % 8; }
-                      : [](unsigned I) { return I; };
   for (unsigned I = 0; I != NumElems; ++I) {
-    bool MaskBit = Mask[BitIndex(I)];
+    bool MaskBit = Mask[I % 8];
     if (ElemT == PT_Float) {
       assert(DstElemT == PT_Float);
       Dst.elem<Floating>(I) =
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 63884ac815a56..3f420fa1ecd1f 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11967,13 +11967,10 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     auto SourceLen = SourceF.getVectorLength();
     SmallVector<APValue, 32> ResultElements;
     ResultElements.reserve(SourceLen);
-    auto BitIndex = E->getBuiltinCallee() == X86::BI__builtin_ia32_pblendw256
-                        ? [](unsigned I) { return I % 8; }
-                        : [](unsigned I) { return I; };
     for (unsigned EltNum = 0; EltNum != SourceLen; ++EltNum) {
       const APValue &F = SourceF.getVectorElt(EltNum);
       const APValue &T = SourceT.getVectorElt(EltNum);
-      ResultElements.push_back(C[BitIndex(EltNum)] ? T : F);
+      ResultElements.push_back(C[EltNum % 8] ? T : F);
     }
 
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);

From 407b0101eee6d97b74a03d8abd2580071c52178c Mon Sep 17 00:00:00 2001
From: Brandon <61314499+brandonxin@users.noreply.github.com>
Date: Wed, 10 Sep 2025 20:35:13 -0500
Subject: [PATCH 3/7] Update clang/lib/AST/ExprConstant.cpp

Co-authored-by: Timm Baeder
---
 clang/lib/AST/ExprConstant.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 3f420fa1ecd1f..9903458211652 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11964,7 +11964,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     assert(SourceC.getKind() == clang::APValue::Int);
 
     const APInt &C = SourceC.getInt();
-    auto SourceLen = SourceF.getVectorLength();
+    unsigned SourceLen = SourceF.getVectorLength();
     SmallVector<APValue, 32> ResultElements;
     ResultElements.reserve(SourceLen);
     for (unsigned EltNum = 0; EltNum != SourceLen; ++EltNum) {

From 1bab9d19a6e82bb60a6708c29def5c7064a10ff8 Mon Sep 17 00:00:00 2001
From: Brandon Xin
Date: Wed, 10 Sep 2025 20:47:49 -0500
Subject: [PATCH 4/7] Correct inappropriate naming

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 51e94ad1a1027..5d9ed5c321e84 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2835,14 +2835,14 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
                                   const CallExpr *Call, unsigned BuiltinID) {
   PrimType MaskT = *S.getContext().classify(Call->getArg(2));
   APSInt Mask = popToAPSInt(S.Stk, MaskT);
-  const Pointer &TrueElem = S.Stk.pop<Pointer>();
-  const Pointer &FalseElem = S.Stk.pop<Pointer>();
+  const Pointer &TrueVec = S.Stk.pop<Pointer>();
+  const Pointer &FalseVec = S.Stk.pop<Pointer>();
   const Pointer &Dst = S.Stk.peek<Pointer>();
 
-  assert(FalseElem.getNumElems() == TrueElem.getNumElems());
-  assert(FalseElem.getNumElems() == Dst.getNumElems());
-  unsigned NumElems = FalseElem.getNumElems();
-  PrimType ElemT = FalseElem.getFieldDesc()->getPrimType();
+  assert(FalseVec.getNumElems() == TrueVec.getNumElems());
+  assert(FalseVec.getNumElems() == Dst.getNumElems());
+  unsigned NumElems = FalseVec.getNumElems();
+  PrimType ElemT = FalseVec.getFieldDesc()->getPrimType();
   PrimType DstElemT = Dst.getFieldDesc()->getPrimType();
 
@@ -2850,12 +2850,12 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
     if (ElemT == PT_Float) {
       assert(DstElemT == PT_Float);
       Dst.elem<Floating>(I) =
-          MaskBit ? TrueElem.elem<Floating>(I) : FalseElem.elem<Floating>(I);
+          MaskBit ? TrueVec.elem<Floating>(I) : FalseVec.elem<Floating>(I);
     } else {
       APSInt Elem;
       INT_TYPE_SWITCH(ElemT, {
-        Elem = MaskBit ? TrueElem.elem<T>(I).toAPSInt()
-                       : FalseElem.elem<T>(I).toAPSInt();
+        Elem = MaskBit ? TrueVec.elem<T>(I).toAPSInt()
+                       : FalseVec.elem<T>(I).toAPSInt();
       });
       INT_TYPE_SWITCH_NO_BOOL(DstElemT,
                               { Dst.elem<T>(I) = static_cast<T>(Elem); });

From 09b51baa64d081211d7720a6e1db93bec76141f1 Mon Sep 17 00:00:00 2001
From: Brandon Xin
Date: Thu, 11 Sep 2025 10:14:49 -0500
Subject: [PATCH 5/7] Remove unnecessary assertions

---
 clang/lib/AST/ExprConstant.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 9903458211652..5642eb03b0eac 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11959,10 +11959,6 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
         !EvaluateAsRValue(Info, E->getArg(2), SourceC))
       return false;
 
-    assert(SourceF.getKind() == clang::APValue::Vector);
-    assert(SourceT.getKind() == clang::APValue::Vector);
-    assert(SourceC.getKind() == clang::APValue::Int);
-
     const APInt &C = SourceC.getInt();
     unsigned SourceLen = SourceF.getVectorLength();
     SmallVector<APValue, 32> ResultElements;

From b04e7967dc1e433505fc8664ff4a1fc532e3d7d1 Mon Sep 17 00:00:00 2001
From: Brandon Xin
Date: Thu, 11 Sep 2025 13:26:23 -0500
Subject: [PATCH 6/7] Merge two `INT_TYPE_SWITCH`s.

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 5d9ed5c321e84..1fd98a5141f4f 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2852,13 +2852,12 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
       Dst.elem<Floating>(I) =
           MaskBit ? TrueVec.elem<Floating>(I) : FalseVec.elem<Floating>(I);
     } else {
-      APSInt Elem;
-      INT_TYPE_SWITCH(ElemT, {
-        Elem = MaskBit ? TrueVec.elem<T>(I).toAPSInt()
-                       : FalseVec.elem<T>(I).toAPSInt();
+      assert(DstElemT == ElemT);
+      INT_TYPE_SWITCH_NO_BOOL(DstElemT, {
+        Dst.elem<T>(I) =
+            static_cast<T>(MaskBit ? TrueVec.elem<T>(I).toAPSInt()
+                                   : FalseVec.elem<T>(I).toAPSInt());
       });
-      INT_TYPE_SWITCH_NO_BOOL(DstElemT,
-                              { Dst.elem<T>(I) = static_cast<T>(Elem); });
     }
   }
   Dst.initializeAllElements();

From 892db8343140b5e5d918ba77bf84d6e6c2168a20 Mon Sep 17 00:00:00 2001
From: Brandon Xin
Date: Thu, 11 Sep 2025 13:35:03 -0500
Subject: [PATCH 7/7] Remove unused BuiltinID parameter

---
 clang/lib/AST/ByteCode/InterpBuiltin.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 1fd98a5141f4f..7ee6d7aa81aca 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2832,7 +2832,7 @@ static bool interp__builtin_select(InterpState &S, CodePtr OpPC,
 }
 
 static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
-                                  const CallExpr *Call, unsigned BuiltinID) {
+                                  const CallExpr *Call) {
   PrimType MaskT = *S.getContext().classify(Call->getArg(2));
   APSInt Mask = popToAPSInt(S.Stk, MaskT);
   const Pointer &TrueVec = S.Stk.pop<Pointer>();
@@ -3538,7 +3538,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case clang::X86::BI__builtin_ia32_pblendw256:
   case clang::X86::BI__builtin_ia32_pblendd128:
   case clang::X86::BI__builtin_ia32_pblendd256:
-    return interp__builtin_blend(S, OpPC, Call, BuiltinID);
+    return interp__builtin_blend(S, OpPC, Call);
 
   case clang::X86::BI__builtin_ia32_blendvpd:
   case clang::X86::BI__builtin_ia32_blendvpd256:
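
For reference, all eight immediate-blend builtins share the same semantics: bit (I % 8) of the immediate selects element I from the second source, otherwise from the first, and only __builtin_ia32_pblendw256 has more than eight elements, so the low eight immediate bits are reused for its upper 128-bit lane; that is what the `I % 8` / `EltNum % 8` indexing in the hunks above implements. The sketch below is illustrative only and not part of the patch series: it assumes C++14, models the vectors as std::array rather than the builtin vector types, and the helper name blend_ref is made up for the example.

#include <array>
#include <cstddef>

// Scalar reference model of the X86 immediate-blend builtins: bit (I % 8) of
// Imm picks element I from IfOne, otherwise from IfZero.
template <typename E, std::size_t N>
constexpr std::array<E, N> blend_ref(const std::array<E, N> &IfZero,
                                     const std::array<E, N> &IfOne, int Imm) {
  std::array<E, N> R{};
  for (std::size_t I = 0; I != N; ++I)
    R[I] = ((Imm >> (I % 8)) & 1) ? IfOne[I] : IfZero[I];
  return R;
}

// Same values as TEST_CONSTEXPR(match_v4si(_mm_blend_epi32(..., 0x5), -1,2,-3,4)).
constexpr std::array<int, 4> A{1, 2, 3, 4}, B{-1, -2, -3, -4};
constexpr auto R = blend_ref(A, B, 0x5);
static_assert(R[0] == -1 && R[1] == 2 && R[2] == -3 && R[3] == 4, "");

// For the 16-element pblendw256 case the 8 mask bits repeat per 128-bit lane:
// immediate 0x02 selects elements 1 and 9 from the second source.
constexpr std::array<short, 16> Z{}, O{1, 1, 1, 1, 1, 1, 1, 1,
                                       1, 1, 1, 1, 1, 1, 1, 1};
constexpr auto W = blend_ref(Z, O, 0x02);
static_assert(W[1] == 1 && W[9] == 1 && W[0] == 0 && W[8] == 0, "");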