diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 4aa3d51931980..fcc3957f9b8ab 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -2371,7 +2371,8 @@ let Features = "avx512vl",
   def pternlogq256_maskz : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">;
 }

-let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512f",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
   def shuf_f32x4 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">;
   def shuf_f64x2 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">;
   def shuf_i32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">;
@@ -2391,7 +2392,8 @@ let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
       : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, int>)">;
 }

-let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def shuf_f32x4_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
   def shuf_f64x2_256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">;
   def shuf_i32x4_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">;
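Adding `Constexpr` to the attribute lists is what makes calls to these builtins eligible for compile-time evaluation; the interpreter changes below supply the actual semantics. A minimal sketch of what this enables, assuming a C++ translation unit compiled with -mavx512f (the `v8d` typedef is local to the example, not a Clang name):

    // Sketch only: exercises the now-constexpr builtin directly.
    typedef double v8d __attribute__((vector_size(64)));

    constexpr v8d A = {1, 2, 3, 4, 5, 6, 7, 8};
    constexpr v8d B = {10, 20, 30, 40, 50, 60, 70, 80};
    // imm 0b10101100: dest lanes take A lane 0, A lane 3, B lane 2, B lane 2.
    constexpr v8d R = __builtin_ia32_shuf_f64x2(A, B, 0b10101100);
    static_assert(R[0] == 1.0 && R[2] == 7.0 && R[4] == 50.0 && R[6] == 50.0, "");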
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 2ab40ac9cc89c..659b6985d3157 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -4847,6 +4847,39 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
     return interp__builtin_elementwise_triop(S, OpPC, Call,
                                              llvm::APIntOps::fshr);

+  case X86::BI__builtin_ia32_shuf_f32x4_256:
+  case X86::BI__builtin_ia32_shuf_i32x4_256:
+  case X86::BI__builtin_ia32_shuf_f64x2_256:
+  case X86::BI__builtin_ia32_shuf_i64x2_256:
+  case X86::BI__builtin_ia32_shuf_f32x4:
+  case X86::BI__builtin_ia32_shuf_i32x4:
+  case X86::BI__builtin_ia32_shuf_f64x2:
+  case X86::BI__builtin_ia32_shuf_i64x2: {
+    // Destination and sources A, B all have the same type.
+    QualType VecQT = Call->getArg(0)->getType();
+    const auto *VecT = VecQT->castAs<VectorType>();
+    unsigned NumElems = VecT->getNumElements();
+    unsigned ElemBits = S.getASTContext().getTypeSize(VecT->getElementType());
+    unsigned LaneBits = 128u;
+    unsigned NumLanes = (NumElems * ElemBits) / LaneBits;
+    unsigned NumElemsPerLane = LaneBits / ElemBits;
+
+    return interp__builtin_ia32_shuffle_generic(
+        S, OpPC, Call,
+        [NumLanes, NumElemsPerLane](unsigned DstIdx, unsigned ShuffleMask) {
+          // DstIdx selects the source; ShuffleMask selects the lane in it.
+          unsigned BitsPerElem = NumLanes / 2;
+          unsigned IndexMask = (1u << BitsPerElem) - 1;
+          unsigned Lane = DstIdx / NumElemsPerLane;
+          unsigned SrcIdx = (Lane < NumLanes / 2) ? 0 : 1;
+          unsigned BitIdx = BitsPerElem * Lane;
+          unsigned SrcLaneIdx = (ShuffleMask >> BitIdx) & IndexMask;
+          unsigned ElemInLane = DstIdx % NumElemsPerLane;
+          unsigned IdxToPick = SrcLaneIdx * NumElemsPerLane + ElemInLane;
+          return std::pair{SrcIdx, IdxToPick};
+        });
+  }
+
   case X86::BI__builtin_ia32_insertf32x4_256:
   case X86::BI__builtin_ia32_inserti32x4_256:
   case X86::BI__builtin_ia32_insertf64x2_256:
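The lambda's index arithmetic is the entire semantics of vshuff32x4/vshuff64x2 and their integer twins: the destination half picks the source (low half from A, high half from B), and each destination 128-bit lane consumes NumLanes / 2 immediate bits to pick a lane within that source. A standalone restatement, checked against the f64x2 expectations used in the tests further down (`decode` is illustrative, not a Clang helper):

    #include <cassert>
    #include <utility>

    // Hypothetical restatement of the lambda: NumLanes 128-bit lanes per
    // vector, NumElemsPerLane elements per lane. Maps a destination element
    // index plus the immediate to {which source, element index in it}.
    static std::pair<unsigned, unsigned> decode(unsigned DstIdx, unsigned Imm,
                                                unsigned NumLanes,
                                                unsigned NumElemsPerLane) {
      unsigned SelBits = NumLanes / 2;                // imm bits per dest lane
      unsigned Lane = DstIdx / NumElemsPerLane;       // destination lane
      unsigned Src = (Lane < NumLanes / 2) ? 0u : 1u; // low from A, high from B
      unsigned SrcLane = (Imm >> (SelBits * Lane)) & ((1u << SelBits) - 1);
      return {Src, SrcLane * NumElemsPerLane + DstIdx % NumElemsPerLane};
    }

    int main() {
      // 512-bit f64x2: 4 lanes of 2 doubles. Imm 0b10101100 selects
      // A lane 0, A lane 3, B lane 2, B lane 2 for dest lanes 0..3.
      assert(decode(2, 0b10101100, 4, 2) == std::make_pair(0u, 6u)); // A[6] == 7.0
      assert(decode(4, 0b10101100, 4, 2) == std::make_pair(1u, 4u)); // B[4] == 50.0
      return 0;
    }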
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 3b91678f7d400..88d7c79d3b99e 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -13517,6 +13517,42 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     return Success(APValue(ResultElements.data(), ResultElements.size()), E);
   }

+  case X86::BI__builtin_ia32_shuf_f32x4_256:
+  case X86::BI__builtin_ia32_shuf_i32x4_256:
+  case X86::BI__builtin_ia32_shuf_f64x2_256:
+  case X86::BI__builtin_ia32_shuf_i64x2_256:
+  case X86::BI__builtin_ia32_shuf_f32x4:
+  case X86::BI__builtin_ia32_shuf_i32x4:
+  case X86::BI__builtin_ia32_shuf_f64x2:
+  case X86::BI__builtin_ia32_shuf_i64x2: {
+    // Destination and sources A, B all have the same type.
+    const VectorType *VT = E->getArg(0)->getType()->castAs<VectorType>();
+    unsigned NumElems = VT->getNumElements();
+    unsigned ElemBits = Info.Ctx.getTypeSize(VT->getElementType());
+    unsigned LaneBits = 128u;
+    unsigned NumLanes = (NumElems * ElemBits) / LaneBits;
+    unsigned NumElemsPerLane = LaneBits / ElemBits;
+
+    APValue R;
+    if (!evalShuffleGeneric(
+            Info, E, R,
+            [NumLanes, NumElemsPerLane](unsigned DstIdx, unsigned ShuffleMask)
+                -> std::pair<unsigned, unsigned> {
+              // DstIdx selects the source; ShuffleMask selects the lane in it.
+              unsigned BitsPerElem = NumLanes / 2;
+              unsigned IndexMask = (1u << BitsPerElem) - 1;
+              unsigned Lane = DstIdx / NumElemsPerLane;
+              unsigned SrcIdx = (Lane < NumLanes / 2) ? 0 : 1;
+              unsigned BitIdx = BitsPerElem * Lane;
+              unsigned SrcLaneIdx = (ShuffleMask >> BitIdx) & IndexMask;
+              unsigned ElemInLane = DstIdx % NumElemsPerLane;
+              unsigned IdxToPick = SrcLaneIdx * NumElemsPerLane + ElemInLane;
+              return {SrcIdx, IdxToPick};
+            }))
+      return false;
+    return Success(R, E);
+  }
+
   case X86::BI__builtin_ia32_insertf32x4_256:
   case X86::BI__builtin_ia32_inserti32x4_256:
   case X86::BI__builtin_ia32_insertf64x2_256:
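The same index mapping appears twice because Clang has two constant evaluators: the tree evaluator in ExprConstant.cpp above and the bytecode interpreter in InterpBuiltin.cpp, and both must agree for the tests below to pass in either mode. Note also that only the unmasked builtins need handling here: the _mm*_mask_* and _mm*_maskz_* intrinsics are header wrappers that feed the shuffle result through a select builtin that is already constant-evaluable, which is why the masked TEST_CONSTEXPR cases below fold with no further changes. A scalar restatement of the maskz behavior, checked against one expectation from the tests (standalone sketch, not Clang code):

    #include <cassert>
    #include <cstdint>

    // Illustrative model of the maskz_* wrappers: bit i of the mask keeps
    // shuffled element i; a clear bit yields 0 (the mask_* forms take the
    // passthrough element W[i] instead).
    int main() {
      // _mm512_maskz_shuffle_f64x2(0b11110100, A, B, 0b10101100) from below:
      const double Shuffled[8] = {1, 2, 7, 8, 50, 60, 50, 60};
      const uint32_t U = 0b11110100;
      double Out[8];
      for (unsigned I = 0; I != 8; ++I)
        Out[I] = ((U >> I) & 1u) ? Shuffled[I] : 0.0;
      assert(Out[0] == 0 && Out[2] == 7 && Out[3] == 0 && Out[4] == 50);
      return 0;
    }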
diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c
index a148940033642..753b4e48c20ed 100644
--- a/clang/test/CodeGen/X86/avx512f-builtins.c
+++ b/clang/test/CodeGen/X86/avx512f-builtins.c
@@ -6892,6 +6892,7 @@ __m512 test_mm512_shuffle_f32x4(__m512 __A, __m512 __B) {
   // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32>
   return _mm512_shuffle_f32x4(__A, __B, 4);
 }
+TEST_CONSTEXPR(match_m512(_mm512_shuffle_f32x4(((__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), ((__m512){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0, 160.0}), 0b11111111), 13.0f, 14.0f, 15.0f, 16.0f, 13.0f, 14.0f, 15.0f, 16.0f, 130.0, 140.0, 150.0, 160.0, 130.0, 140.0, 150.0, 160.0));

 __m512 test_mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: test_mm512_mask_shuffle_f32x4
@@ -6899,6 +6900,7 @@ __m512 test_mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_shuffle_f32x4(__W, __U, __A, __B, 4);
 }
+TEST_CONSTEXPR(match_m512(_mm512_mask_shuffle_f32x4(((__m512){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 900.0f, 1000.0f, 1100.0f, 1200.0f, 1300.0f, 1400.0f, 1500.0f, 1600.0f}), 0b1111111111111110, ((__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), ((__m512){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0, 160.0}), 0b11111111), 100.0f, 14.0f, 15.0f, 16.0f, 13.0f, 14.0f, 15.0f, 16.0f, 130.0, 140.0, 150.0, 160.0, 130.0, 140.0, 150.0, 160.0));

 __m512 test_mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK-LABEL: test_mm512_maskz_shuffle_f32x4
@@ -6906,12 +6908,14 @@ __m512 test_mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_maskz_shuffle_f32x4(__U, __A, __B, 4);
 }
+TEST_CONSTEXPR(match_m512(_mm512_maskz_shuffle_f32x4(0b1111111111110111, ((__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), ((__m512){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0, 160.0}), 0b11111111), 13.0f, 14.0f, 15.0f, 0.0f, 13.0f, 14.0f, 15.0f, 16.0f, 130.0, 140.0, 150.0, 160.0, 130.0, 140.0, 150.0, 160.0));

 __m512d test_mm512_shuffle_f64x2(__m512d __A, __m512d __B) {
   // CHECK-LABEL: test_mm512_shuffle_f64x2
   // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32>
   return _mm512_shuffle_f64x2(__A, __B, 4);
 }
+TEST_CONSTEXPR(match_m512d(_mm512_shuffle_f64x2(((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 0b10101100), 1.0, 2.0, 7.0, 8.0, 50.0, 60.0, 50.0, 60.0));

 __m512d test_mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: test_mm512_mask_shuffle_f64x2
@@ -6919,6 +6923,7 @@ __m512d test_mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_shuffle_f64x2(__W, __U, __A, __B, 4);
 }
+TEST_CONSTEXPR(match_m512d(_mm512_mask_shuffle_f64x2(((__m512d){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}), 0b11110000, ((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 0b10101100), 100.0, 200.0, 300.0, 400.0, 50.0, 60.0, 50.0, 60.0));

 __m512d test_mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK-LABEL: test_mm512_maskz_shuffle_f64x2
@@ -6926,12 +6931,15 @@ __m512d test_mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, __m512d __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_maskz_shuffle_f64x2(__U, __A, __B, 4);
 }
+TEST_CONSTEXPR(match_m512d(_mm512_maskz_shuffle_f64x2(0b11110100, ((__m512d){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m512d){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 0b10101100), 0.0, 0.0, 7.0, 0.0, 50.0, 60.0, 50.0, 60.0));

 __m512i test_mm512_shuffle_i32x4(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_shuffle_i32x4
   // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
   return _mm512_shuffle_i32x4(__A, __B, 4);
 }
+TEST_CONSTEXPR(match_v16si(_mm512_shuffle_i32x4(((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m512i)(__v16si){10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160}), 0), 1, 2, 3, 4, 1, 2, 3, 4, 10, 20, 30, 40, 10, 20, 30, 40));
+
 __m512i test_mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_shuffle_i32x4
@@ -6939,6 +6947,7 @@ __m512i test_mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_shuffle_i32x4(__W, __U, __A, __B, 4);
 }
+TEST_CONSTEXPR(match_v16si(_mm512_mask_shuffle_i32x4(((__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600}), 0b1111111111111011, ((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m512i)(__v16si){10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160}), 0), 1, 2, 300, 4, 1, 2, 3, 4, 10, 20, 30, 40, 10, 20, 30, 40));

 __m512i test_mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_shuffle_i32x4
@@ -6946,12 +6955,14 @@ __m512i test_mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_shuffle_i32x4(__U, __A, __B, 4);
 }
+TEST_CONSTEXPR(match_v16si(_mm512_maskz_shuffle_i32x4(0b1011111111111111, ((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m512i)(__v16si){10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160}), 0), 1, 2, 3, 4, 1, 2, 3, 4, 10, 20, 30, 40, 10, 20, 0, 40));

 __m512i test_mm512_shuffle_i64x2(__m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_shuffle_i64x2
   // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32>
   return _mm512_shuffle_i64x2(__A, __B, 4);
 }
+TEST_CONSTEXPR(match_m512i(_mm512_shuffle_i64x2(((__m512i){1, 2, 3, 4, 5, 6, 7, 8}), ((__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 0b11000110), 5, 6, 3, 4, 10, 20, 70, 80));

 __m512i test_mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_mask_shuffle_i64x2
@@ -6959,6 +6970,7 @@ __m512i test_mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_mask_shuffle_i64x2(__W, __U, __A, __B, 4);
 }
+TEST_CONSTEXPR(match_m512i(_mm512_mask_shuffle_i64x2(((__m512i){100, 200, 300, 400, 500, 600, 700, 800}), 0b11111101, ((__m512i){1, 2, 3, 4, 5, 6, 7, 8}), ((__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 0b11000110), 5, 200, 3, 4, 10, 20, 70, 80));

 __m512i test_mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: test_mm512_maskz_shuffle_i64x2
@@ -6966,6 +6978,7 @@ __m512i test_mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, __m512i __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
   return _mm512_maskz_shuffle_i64x2(__U, __A, __B, 4);
 }
+TEST_CONSTEXPR(match_m512i(_mm512_maskz_shuffle_i64x2(0b00111101, ((__m512i){1, 2, 3, 4, 5, 6, 7, 8}), ((__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 0b11000110), 5, 0, 3, 4, 10, 20, 0, 0));

 __m512d test_mm512_shuffle_pd(__m512d __M, __m512d __V) {
   // CHECK-LABEL: test_mm512_shuffle_pd
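For the 256-bit variants below, the immediate degenerates: with only two 128-bit lanes per source, each destination lane is selected by a single bit (bit 0 picks the A lane for the low half, bit 1 picks the B lane for the high half), so only imm & 3 matters. A standalone trace of the _mm256_shuffle_f64x2(A, B, 3) expectation used in the tests that follow (sketch, not Clang code):

    #include <cassert>

    int main() {
      const double A[4] = {1, 2, 3, 4}, B[4] = {10, 20, 30, 40};
      const unsigned Imm = 3; // bit 0 -> A lane 1, bit 1 -> B lane 1
      double Out[4];
      for (unsigned I = 0; I != 4; ++I) {
        unsigned Lane = I / 2;                 // destination lane (0 or 1)
        const double *Src = Lane == 0 ? A : B; // low lane from A, high from B
        unsigned SrcLane = (Imm >> Lane) & 1u; // one selector bit per lane
        Out[I] = Src[SrcLane * 2 + I % 2];
      }
      assert(Out[0] == 3 && Out[1] == 4 && Out[2] == 30 && Out[3] == 40);
      return 0;
    }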
diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c
index 58bb8bef6fb46..e1878bd84b1ad 100644
--- a/clang/test/CodeGen/X86/avx512vl-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vl-builtins.c
@@ -9078,6 +9078,7 @@ __m256 test_mm256_shuffle_f32x4(__m256 __A, __m256 __B) {
   // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32>
   return _mm256_shuffle_f32x4(__A, __B, 3);
 }
+TEST_CONSTEXPR(match_m256(_mm256_shuffle_f32x4(((__m256){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m256){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 1), 5.0, 6.0, 7.0, 8.0, 10.0, 20.0, 30.0, 40.0));

 __m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK-LABEL: test_mm256_mask_shuffle_f32x4
@@ -9085,6 +9086,7 @@ __m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_mask_shuffle_f32x4(__W, __U, __A, __B, 3);
 }
+TEST_CONSTEXPR(match_m256(_mm256_mask_shuffle_f32x4(((__m256){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}), 0b10101010, ((__m256){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m256){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 1), 100.0, 6.0, 300.0, 8.0, 500.0, 20.0, 700.0, 40.0));

 __m256 test_mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK-LABEL: test_mm256_maskz_shuffle_f32x4
@@ -9092,12 +9094,14 @@ __m256 test_mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}}
   return _mm256_maskz_shuffle_f32x4(__U, __A, __B, 3);
 }
+TEST_CONSTEXPR(match_m256(_mm256_maskz_shuffle_f32x4(0b01010101, ((__m256){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m256){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 1), 5.0, 0.0, 7.0, 0.0, 10.0, 0.0, 30.0, 0.0));

 __m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) {
   // CHECK-LABEL: test_mm256_shuffle_f64x2
   // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32>
   return _mm256_shuffle_f64x2(__A, __B, 3);
 }
+TEST_CONSTEXPR(match_m256d(_mm256_shuffle_f64x2(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){10.0, 20.0, 30.0, 40.0}), 3), 3.0, 4.0, 30.0, 40.0));

 __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: test_mm256_mask_shuffle_f64x2
@@ -9106,6 +9110,7 @@ __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_mask_shuffle_f64x2(__W, __U, __A, __B, 3);
 }
+TEST_CONSTEXPR(match_m256d(_mm256_mask_shuffle_f64x2(((__m256d){100.0, 200.0, 300.0, 400.0}), 0b00001111, ((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){10.0, 20.0, 30.0, 40.0}), 3), 3.0, 4.0, 30.0, 40.0));

 __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK-LABEL: test_mm256_maskz_shuffle_f64x2
@@ -9114,12 +9119,14 @@ __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) {
   // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}}
   return _mm256_maskz_shuffle_f64x2(__U, __A, __B, 3);
 }
+TEST_CONSTEXPR(match_m256d(_mm256_maskz_shuffle_f64x2(0b00001011, ((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){10.0, 20.0, 30.0, 40.0}), 3), 3.0, 4.0, 0.0, 40.0));

 __m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_shuffle_i32x4
   // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32>
   return _mm256_shuffle_i32x4(__A, __B, 3);
 }
+TEST_CONSTEXPR(match_v8si(_mm256_shuffle_i32x4(((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}), ((__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80}), 0), 1, 2, 3, 4, 10, 20, 30, 40));

 __m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_shuffle_i32x4
@@ -9127,6 +9134,7 @@ __m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_shuffle_i32x4(__W, __U, __A, __B, 3);
 }
+TEST_CONSTEXPR(match_v8si(_mm256_mask_shuffle_i32x4(((__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800}), 0b00000000, ((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}), ((__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80}), 0), 100, 200, 300, 400, 500, 600, 700, 800));

 __m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_shuffle_i32x4
@@ -9134,12 +9142,14 @@ __m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_shuffle_i32x4(__U, __A, __B, 3);
 }
+TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_i32x4(0b11111111, ((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}), ((__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80}), 0), 1, 2, 3, 4, 10, 20, 30, 40));

 __m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_shuffle_i64x2
   // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32>
   return _mm256_shuffle_i64x2(__A, __B, 3);
 }
+TEST_CONSTEXPR(match_m256i(_mm256_shuffle_i64x2(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), ((__m256i){10ULL, 20ULL, 30ULL, 40ULL}), 2), 1ULL, 2ULL, 30ULL, 40ULL));

 __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_mask_shuffle_i64x2
@@ -9148,6 +9158,7 @@ __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_mask_shuffle_i64x2(__W, __U, __A, __B, 3);
 }
+TEST_CONSTEXPR(match_m256i(_mm256_mask_shuffle_i64x2(((__m256i){100ULL, 200ULL, 300ULL, 400ULL}), 0b00001101, ((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), ((__m256i){10ULL, 20ULL, 30ULL, 40ULL}), 2), 1ULL, 200ULL, 30ULL, 40ULL));

 __m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: test_mm256_maskz_shuffle_i64x2
@@ -9156,6 +9167,7 @@ __m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}}
   return _mm256_maskz_shuffle_i64x2(__U, __A, __B, 3);
 }
+TEST_CONSTEXPR(match_m256i(_mm256_maskz_shuffle_i64x2(0b00000110, ((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), ((__m256i){10ULL, 20ULL, 30ULL, 40ULL}), 2), 0ULL, 2ULL, 30ULL, 0ULL));

 __m128d test_mm_mask_shuffle_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) {
   // CHECK-LABEL: test_mm_mask_shuffle_pd
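Taken together, the patch lets these intrinsics participate in constant expressions in user code as well; the TEST_CONSTEXPR cases above amount to static_asserts of this form. A sketch, assuming a C++ translation unit compiled with -mavx512f:

    #include <immintrin.h>

    constexpr __m512i A = {1, 2, 3, 4, 5, 6, 7, 8};
    constexpr __m512i B = {10, 20, 30, 40, 50, 60, 70, 80};
    // Same operands and immediate as the test_mm512_shuffle_i64x2 case above:
    // expected result is {5, 6, 3, 4, 10, 20, 70, 80}.
    constexpr __m512i R = _mm512_shuffle_i64x2(A, B, 0b11000110);
    static_assert(R[0] == 5 && R[2] == 3 && R[4] == 10 && R[6] == 70, "");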