Skip to content

Commit 019b3ba

Browse files
committed
[X86][bytecode] Allow SSE/AVX BLEND imm intrinsics to be used in constexpr
1 parent ee5bc57 commit 019b3ba

File tree

6 files changed

+123
-9
lines changed

6 files changed

+123
-9
lines changed

clang/include/clang/Basic/BuiltinsX86.td

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -312,9 +312,6 @@ let Features = "ssse3", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
312312

313313
let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
314314
def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
315-
def pblendw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">;
316-
def blendpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
317-
def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
318315
def packusdw128 : X86Builtin<"_Vector<8, short>(_Vector<4, int>, _Vector<4, int>)">;
319316
def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
320317
def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
@@ -333,6 +330,9 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
333330
}
334331

335332
let Features = "sse4.1", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
333+
def pblendw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>, _Constant int)">;
334+
def blendpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
335+
def blendps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
336336
def blendvpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Vector<2, double>)">;
337337
def blendvps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Vector<4, float>)">;
338338
def pblendvb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>, _Vector<16, char>)">;
@@ -469,8 +469,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
469469
def vpermilvarps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>)">;
470470
def vpermilvarpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>)">;
471471
def vpermilvarps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
472-
def blendpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
473-
def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
474472
def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
475473
def shufps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
476474
def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
@@ -495,6 +493,8 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
495493
}
496494

497495
// AVX 256-bit blend builtins. Every def in this group carries the Constexpr
// attribute in addition to NoThrow, Const and RequiredVectorWidth<256>.
// blendpd256/blendps256 take their mask as a `_Constant int` immediate, while
// blendvpd256/blendvps256 take it as a third vector operand.
let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
496+
def blendpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
497+
def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
498498
def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
499499
def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
500500
}
@@ -575,7 +575,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
575575
def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">;
576576
def packusdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">;
577577
def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
578-
def pblendw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">;
579578
def phaddw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
580579
def phaddd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
581580
def phaddsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
@@ -604,8 +603,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
604603
def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">;
605604
def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">;
606605
def psrlq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">;
607-
def pblendd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
608-
def pblendd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
609606
def permvarsi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
610607
def permdf256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
611608
def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
@@ -619,6 +616,10 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
619616
def pavgb256 : X86Builtin<"_Vector<32, unsigned char>(_Vector<32, unsigned char>, _Vector<32, unsigned char>)">;
620617
def pavgw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
621618

619+
def pblendd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>, _Constant int)">;
620+
def pblendd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
621+
def pblendw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>, _Constant int)">;
622+
622623
def pblendvb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Vector<32, char>)">;
623624

624625
def pmuldq256 : X86Builtin<"_Vector<4, long long int>(_Vector<8, int>, _Vector<8, int>)">;

clang/lib/AST/ByteCode/InterpBuiltin.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2831,6 +2831,44 @@ static bool interp__builtin_select(InterpState &S, CodePtr OpPC,
28312831
return true;
28322832
}
28332833

2834+
// Bytecode-interpreter evaluation of the immediate-mask x86 blend builtins.
// Pops the integer mask and two vector pointers from the interpreter stack,
// then fills the destination vector (left on the stack) element by element:
// element I comes from TrueElem when the selected mask bit is set, otherwise
// from FalseElem. Always returns true.
static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
2835+
const CallExpr *Call, unsigned BuiltinID) {
2836+
// The mask's primitive type is derived from the third call argument.
PrimType MaskT = *S.getContext().classify(Call->getArg(2));
2837+
APSInt Mask = popToAPSInt(S.Stk, MaskT);
2838+
// TrueElem is popped before FalseElem.
// NOTE(review): presumably these are call arguments 1 and 0 respectively --
// confirm against the order in which arguments are pushed.
const Pointer &TrueElem = S.Stk.pop<Pointer>();
2839+
const Pointer &FalseElem = S.Stk.pop<Pointer>();
2840+
// Dst is only peeked, so it remains on the stack after this call.
const Pointer &Dst = S.Stk.peek<Pointer>();
2841+
2842+
// Both sources and the destination must have the same element count.
assert(FalseElem.getNumElems() == TrueElem.getNumElems());
2843+
assert(FalseElem.getNumElems() == Dst.getNumElems());
2844+
unsigned NumElems = FalseElem.getNumElems();
2845+
PrimType ElemT = FalseElem.getFieldDesc()->getPrimType();
2846+
PrimType DstElemT = Dst.getFieldDesc()->getPrimType();
2847+
2848+
// For pblendw256 the mask bit for element I is bit I % 8, so the same eight
// mask bits are reused once the element index reaches 8. Every other builtin
// routed here uses bit I for element I.
auto BitIndex = BuiltinID == X86::BI__builtin_ia32_pblendw256
2849+
? [](unsigned I) { return I % 8; }
2850+
: [](unsigned I) { return I; };
2851+
for (unsigned I = 0; I != NumElems; ++I) {
2852+
bool MaskBit = Mask[BitIndex(I)];
2853+
// Floating-point elements are copied directly as Floating values.
if (ElemT == PT_Float) {
2854+
assert(DstElemT == PT_Float);
2855+
Dst.elem<Floating>(I) =
2856+
MaskBit ? TrueElem.elem<Floating>(I) : FalseElem.elem<Floating>(I);
2857+
} else {
2858+
// Integer elements are read as an APSInt using the source element type,
// then cast to the destination element type when stored.
APSInt Elem;
2859+
INT_TYPE_SWITCH(ElemT, {
2860+
Elem = MaskBit ? TrueElem.elem<T>(I).toAPSInt()
2861+
: FalseElem.elem<T>(I).toAPSInt();
2862+
});
2863+
INT_TYPE_SWITCH_NO_BOOL(DstElemT,
2864+
{ Dst.elem<T>(I) = static_cast<T>(Elem); });
2865+
}
2866+
}
2867+
// Flag the whole destination vector as initialized (per the helper's name)
// now that the loop has written every element.
Dst.initializeAllElements();
2868+
2869+
return true;
2870+
}
2871+
28342872
static bool interp__builtin_elementwise_triop(
28352873
InterpState &S, CodePtr OpPC, const CallExpr *Call,
28362874
llvm::function_ref<APInt(const APSInt &, const APSInt &, const APSInt &)>
@@ -3496,6 +3534,16 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
34963534
return llvm::APIntOps::fshr(Hi, Lo, Amt);
34973535
});
34983536

3537+
case clang::X86::BI__builtin_ia32_blendpd:
3538+
case clang::X86::BI__builtin_ia32_blendpd256:
3539+
case clang::X86::BI__builtin_ia32_blendps:
3540+
case clang::X86::BI__builtin_ia32_blendps256:
3541+
case clang::X86::BI__builtin_ia32_pblendw128:
3542+
case clang::X86::BI__builtin_ia32_pblendw256:
3543+
case clang::X86::BI__builtin_ia32_pblendd128:
3544+
case clang::X86::BI__builtin_ia32_pblendd256:
3545+
return interp__builtin_blend(S, OpPC, Call, BuiltinID);
3546+
34993547
case clang::X86::BI__builtin_ia32_blendvpd:
35003548
case clang::X86::BI__builtin_ia32_blendvpd256:
35013549
case clang::X86::BI__builtin_ia32_blendvps:

clang/lib/AST/ExprConstant.cpp

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11945,6 +11945,40 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
1194511945

1194611946
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
1194711947
}
11948+
// Constant evaluation of the immediate-mask x86 blend builtins: builds the
// result vector by taking element EltNum from argument 1 when the selected
// bit of the immediate (argument 2) is set, otherwise from argument 0.
case X86::BI__builtin_ia32_blendpd:
11949+
case X86::BI__builtin_ia32_blendpd256:
11950+
case X86::BI__builtin_ia32_blendps:
11951+
case X86::BI__builtin_ia32_blendps256:
11952+
case X86::BI__builtin_ia32_pblendw128:
11953+
case X86::BI__builtin_ia32_pblendw256:
11954+
case X86::BI__builtin_ia32_pblendd128:
11955+
case X86::BI__builtin_ia32_pblendd256: {
11956+
// SourceF = argument 0 (used for clear mask bits), SourceT = argument 1
// (used for set mask bits), SourceC = argument 2 (the mask).
APValue SourceF, SourceT, SourceC;
11957+
// Evaluation fails as a whole if any of the three operands cannot be
// evaluated as an rvalue.
if (!EvaluateAsRValue(Info, E->getArg(0), SourceF) ||
11958+
!EvaluateAsRValue(Info, E->getArg(1), SourceT) ||
11959+
!EvaluateAsRValue(Info, E->getArg(2), SourceC))
11960+
return false;
11961+
11962+
assert(SourceF.getKind() == clang::APValue::Vector);
11963+
assert(SourceT.getKind() == clang::APValue::Vector);
11964+
assert(SourceC.getKind() == clang::APValue::Int);
11965+
11966+
const APInt &C = SourceC.getInt();
11967+
auto SourceLen = SourceF.getVectorLength();
11968+
SmallVector<APValue, 32> ResultElements;
11969+
ResultElements.reserve(SourceLen);
11970+
// For pblendw256 the mask bit for element I is bit I % 8, so the same eight
// mask bits are reused once the element index reaches 8; every other builtin
// in this case group uses bit I for element I.
auto BitIndex = E->getBuiltinCallee() == X86::BI__builtin_ia32_pblendw256
11971+
? [](unsigned I) { return I % 8; }
11972+
: [](unsigned I) { return I; };
11973+
for (unsigned EltNum = 0; EltNum != SourceLen; ++EltNum) {
11974+
const APValue &F = SourceF.getVectorElt(EltNum);
11975+
const APValue &T = SourceT.getVectorElt(EltNum);
11976+
ResultElements.push_back(C[BitIndex(EltNum)] ? T : F);
11977+
}
11978+
11979+
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
11980+
}
11981+
1194811982
case X86::BI__builtin_ia32_blendvpd:
1194911983
case X86::BI__builtin_ia32_blendvpd256:
1195011984
case X86::BI__builtin_ia32_blendvps:

clang/test/CodeGen/X86/avx-builtins.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,20 @@ __m256d test_mm256_blend_pd(__m256d A, __m256d B) {
8787
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
8888
return _mm256_blend_pd(A, B, 0x05);
8989
}
90+
TEST_CONSTEXPR(match_m256d(_mm256_blend_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x00), 1.0, 2.0, 3.0, 4.0));
91+
TEST_CONSTEXPR(match_m256d(_mm256_blend_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x05), 5.0, 2.0, 7.0, 4.0));
92+
TEST_CONSTEXPR(match_m256d(_mm256_blend_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x0A), 1.0, 6.0, 3.0, 8.0));
93+
TEST_CONSTEXPR(match_m256d(_mm256_blend_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){5.0, 6.0, 7.0, 8.0}), 0x0F), 5.0, 6.0, 7.0, 8.0));
9094

9195
// Codegen test: calls _mm256_blend_ps with the immediate mask 0x35; the
// FileCheck directives in the body expect a shufflevector on <8 x float>.
__m256 test_mm256_blend_ps(__m256 A, __m256 B) {
9296
// CHECK-LABEL: test_mm256_blend_ps
9397
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
9498
return _mm256_blend_ps(A, B, 0x35);
9599
}
100+
TEST_CONSTEXPR(match_m256(_mm256_blend_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f}), 0x00), 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f));
101+
TEST_CONSTEXPR(match_m256(_mm256_blend_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f}), 0x35), -1.0f, 2.0f, -3.0f, 4.0f, -5.0f, -6.0f, 7.0f, 8.0f));
102+
TEST_CONSTEXPR(match_m256(_mm256_blend_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f}), 0xAA), 1.0f, -2.0f, 3.0f, -4.0f, 5.0f, -6.0f, 7.0f, -8.0f));
103+
TEST_CONSTEXPR(match_m256(_mm256_blend_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256){-1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f}), 0xFF), -1.0f, -2.0f, -3.0f, -4.0f, -5.0f, -6.0f, -7.0f, -8.0f));
96104

97105
__m256d test_mm256_blendv_pd(__m256d V1, __m256d V2, __m256d V3) {
98106
// CHECK-LABEL: test_mm256_blendv_pd

clang/test/CodeGen/X86/avx2-builtins.c

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,20 +146,32 @@ __m256i test_mm256_blend_epi16(__m256i a, __m256i b) {
146146
// CHECK: shufflevector <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i32> <i32 0, i32 17, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 25, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
147147
return _mm256_blend_epi16(a, b, 2);
148148
}
149+
TEST_CONSTEXPR(match_v16hi(_mm256_blend_epi16(((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}), ((__m256i)(__v16hi){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}), 0x00), 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16));
150+
TEST_CONSTEXPR(match_v16hi(_mm256_blend_epi16(((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}), ((__m256i)(__v16hi){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}), 0x5A), 1,-2,3,-4,-5,6,-7,8,9,-10,11,-12,-13,14,-15,16));
151+
TEST_CONSTEXPR(match_v16hi(_mm256_blend_epi16(((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}), ((__m256i)(__v16hi){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}), 0x94), 1,2,-3,4,-5,6,7,-8,9,10,-11,12,-13,14,15,-16));
152+
TEST_CONSTEXPR(match_v16hi(_mm256_blend_epi16(((__m256i)(__v16hi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}), ((__m256i)(__v16hi){-1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16}), 0xFF), -1,-2,-3,-4,-5,-6,-7,-8,-9,-10,-11,-12,-13,-14,-15,-16));
149153

150154
// Codegen test: calls _mm_blend_epi32 with the immediate mask 0x05; the
// FileCheck directives in the body expect a shufflevector on <4 x i32> and
// forbid a reference to @llvm.x86.avx2.pblendd.128.
__m128i test_mm_blend_epi32(__m128i a, __m128i b) {
151155
// CHECK-LABEL: test_mm_blend_epi32
152156
// CHECK-NOT: @llvm.x86.avx2.pblendd.128
153157
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
154158
return _mm_blend_epi32(a, b, 0x05);
155159
}
160+
TEST_CONSTEXPR(match_v4si(_mm_blend_epi32(((__m128i)(__v4si){1,2,3,4}), ((__m128i)(__v4si){-1,-2,-3,-4}), 0x0), 1,2,3,4));
161+
TEST_CONSTEXPR(match_v4si(_mm_blend_epi32(((__m128i)(__v4si){1,2,3,4}), ((__m128i)(__v4si){-1,-2,-3,-4}), 0x5), -1,2,-3,4));
162+
TEST_CONSTEXPR(match_v4si(_mm_blend_epi32(((__m128i)(__v4si){1,2,3,4}), ((__m128i)(__v4si){-1,-2,-3,-4}), 0xA), 1,-2,3,-4));
163+
TEST_CONSTEXPR(match_v4si(_mm_blend_epi32(((__m128i)(__v4si){1,2,3,4}), ((__m128i)(__v4si){-1,-2,-3,-4}), 0xF), -1,-2,-3,-4));
156164

157165
// Codegen test: calls _mm256_blend_epi32 with the immediate mask 0x35; the
// FileCheck directives in the body expect a shufflevector on <8 x i32> and
// forbid a reference to @llvm.x86.avx2.pblendd.256.
__m256i test_mm256_blend_epi32(__m256i a, __m256i b) {
158166
// CHECK-LABEL: test_mm256_blend_epi32
159167
// CHECK-NOT: @llvm.x86.avx2.pblendd.256
160168
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 13, i32 6, i32 7>
161169
return _mm256_blend_epi32(a, b, 0x35);
162170
}
171+
TEST_CONSTEXPR(match_v8si(_mm256_blend_epi32(((__m256i)(__v8si){1,2,3,4,5,6,7,8}), ((__m256i)(__v8si){-1,-2,-3,-4,-5,-6,-7,-8}), 0x00), 1,2,3,4,5,6,7,8));
172+
TEST_CONSTEXPR(match_v8si(_mm256_blend_epi32(((__m256i)(__v8si){1,2,3,4,5,6,7,8}), ((__m256i)(__v8si){-1,-2,-3,-4,-5,-6,-7,-8}), 0xA5), -1,2,-3,4,5,-6,7,-8));
173+
TEST_CONSTEXPR(match_v8si(_mm256_blend_epi32(((__m256i)(__v8si){1,2,3,4,5,6,7,8}), ((__m256i)(__v8si){-1,-2,-3,-4,-5,-6,-7,-8}), 0x94), 1,2,-3,4,-5,6,7,-8));
174+
TEST_CONSTEXPR(match_v8si(_mm256_blend_epi32(((__m256i)(__v8si){1,2,3,4,5,6,7,8}), ((__m256i)(__v8si){-1,-2,-3,-4,-5,-6,-7,-8}), 0xFF), -1,-2,-3,-4,-5,-6,-7,-8));
163175

164176
__m256i test_mm256_blendv_epi8(__m256i a, __m256i b, __m256i m) {
165177
// CHECK-LABEL: test_mm256_blendv_epi8

clang/test/CodeGen/X86/sse41-builtins.c

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,30 @@ __m128i test_mm_blend_epi16(__m128i V1, __m128i V2) {
2727
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 7>
2828
return _mm_blend_epi16(V1, V2, 42);
2929
}
30+
TEST_CONSTEXPR(match_v8hi(_mm_blend_epi16(((__m128i)(__v8hi){1,2,3,4,5,6,7,8}),((__m128i)(__v8hi){-1,-2,-3,-4,-5,-6,-7,-8}),0x00),1,2,3,4,5,6,7,8));
31+
TEST_CONSTEXPR(match_v8hi(_mm_blend_epi16(((__m128i)(__v8hi){1,2,3,4,5,6,7,8}),((__m128i)(__v8hi){-1,-2,-3,-4,-5,-6,-7,-8}),0x5A),1,-2,3,-4,-5,6,-7,8));
32+
TEST_CONSTEXPR(match_v8hi(_mm_blend_epi16(((__m128i)(__v8hi){1,2,3,4,5,6,7,8}),((__m128i)(__v8hi){-1,-2,-3,-4,-5,-6,-7,-8}),0x94),1,2,-3,4,-5,6,7,-8));
33+
TEST_CONSTEXPR(match_v8hi(_mm_blend_epi16(((__m128i)(__v8hi){1,2,3,4,5,6,7,8}),((__m128i)(__v8hi){-1,-2,-3,-4,-5,-6,-7,-8}),0xFF),-1,-2,-3,-4,-5,-6,-7,-8));
3034

3135
// Codegen test: calls _mm_blend_pd with the immediate mask 2; the FileCheck
// directives in the body expect a shufflevector on <2 x double>.
__m128d test_mm_blend_pd(__m128d V1, __m128d V2) {
3236
// CHECK-LABEL: test_mm_blend_pd
3337
// CHECK: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 0, i32 3>
3438
return _mm_blend_pd(V1, V2, 2);
3539
}
40+
TEST_CONSTEXPR(match_m128d(_mm_blend_pd(((__m128d){1.0, 2.0}), ((__m128d){3.0, 4.0}), 0), 1.0, 2.0));
41+
TEST_CONSTEXPR(match_m128d(_mm_blend_pd(((__m128d){1.0, 2.0}), ((__m128d){3.0, 4.0}), 1), 3.0, 2.0));
42+
TEST_CONSTEXPR(match_m128d(_mm_blend_pd(((__m128d){1.0, 2.0}), ((__m128d){3.0, 4.0}), 2), 1.0, 4.0));
43+
TEST_CONSTEXPR(match_m128d(_mm_blend_pd(((__m128d){1.0, 2.0}), ((__m128d){3.0, 4.0}), 3), 3.0, 4.0));
3644

3745
// Codegen test: calls _mm_blend_ps with the immediate mask 6; the FileCheck
// directives in the body expect a shufflevector on <4 x float>.
__m128 test_mm_blend_ps(__m128 V1, __m128 V2) {
3846
// CHECK-LABEL: test_mm_blend_ps
3947
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
4048
return _mm_blend_ps(V1, V2, 6);
4149
}
50+
TEST_CONSTEXPR(match_m128(_mm_blend_ps(((__m128){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128){5.0f, 6.0f, 7.0f, 8.0f}), 0x0), 1.0f, 2.0f, 3.0f, 4.0f));
51+
TEST_CONSTEXPR(match_m128(_mm_blend_ps(((__m128){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128){5.0f, 6.0f, 7.0f, 8.0f}), 0x5), 5.0f, 2.0f, 7.0f, 4.0f));
52+
TEST_CONSTEXPR(match_m128(_mm_blend_ps(((__m128){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128){5.0f, 6.0f, 7.0f, 8.0f}), 0xA), 1.0f, 6.0f, 3.0f, 8.0f));
53+
TEST_CONSTEXPR(match_m128(_mm_blend_ps(((__m128){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128){5.0f, 6.0f, 7.0f, 8.0f}), 0xF), 5.0f, 6.0f, 7.0f, 8.0f));
4254

4355
__m128i test_mm_blendv_epi8(__m128i V1, __m128i V2, __m128i V3) {
4456
// CHECK-LABEL: test_mm_blendv_epi8
@@ -459,4 +471,3 @@ int test_mm_testz_si128(__m128i x, __m128i y) {
459471
// CHECK: call {{.*}}i32 @llvm.x86.sse41.ptestz(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
460472
return _mm_testz_si128(x, y);
461473
}
462-

0 commit comments

Comments
 (0)