From bdfc0fe8df0bb34cb9dfcffe90dd5d827ab93766 Mon Sep 17 00:00:00 2001 From: Moritz Zielke Date: Mon, 17 Nov 2025 13:33:10 +0100 Subject: [PATCH 1/3] [Clang] Allow AVX/AVX512 subvector shuffles in constexpr --- clang/include/clang/Basic/BuiltinsX86.td | 6 ++- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 33 ++++++++++++++ clang/lib/AST/ExprConstant.cpp | 51 ++++++++++++++++++++++ clang/test/CodeGen/X86/avx512f-builtins.c | 5 +++ clang/test/CodeGen/X86/avx512vl-builtins.c | 4 ++ 5 files changed, 97 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 7a14c6ec21a1a..272704c8451a7 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -2464,7 +2464,8 @@ let Features = "avx512vl", def pternlogq256_maskz : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Vector<4, long long int>, _Constant int, unsigned char)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def shuf_f32x4 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">; def shuf_f64x2 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">; def shuf_i32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">; @@ -2476,7 +2477,8 @@ let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVecto def shufps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", + Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def shuf_f32x4_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">; def shuf_f64x2_256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">; def shuf_i32x4_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>, _Constant int)">; diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 5a96320e12b6f..8e0d254bdddc9 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -4718,6 +4718,39 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return interp__builtin_elementwise_triop(S, OpPC, Call, llvm::APIntOps::fshr); + case X86::BI__builtin_ia32_shuf_f32x4_256: + case X86::BI__builtin_ia32_shuf_i32x4_256: + case X86::BI__builtin_ia32_shuf_f64x2_256: + case X86::BI__builtin_ia32_shuf_i64x2_256: + case X86::BI__builtin_ia32_shuf_f32x4: + case X86::BI__builtin_ia32_shuf_i32x4: + case X86::BI__builtin_ia32_shuf_f64x2: + case X86::BI__builtin_ia32_shuf_i64x2: { + // Destination and sources A, B all have the same type. + QualType VecQT = Call->getArg(0)->getType(); + const auto *VecT = VecQT->castAs(); + unsigned NumElems = VecT->getNumElements(); + unsigned ElemBits = S.getASTContext().getTypeSize(VecT->getElementType()); + constexpr unsigned LaneBits = 128u; + unsigned NumLanes = (NumElems * ElemBits) / LaneBits; + unsigned NumElemsPerLane = LaneBits / ElemBits; + + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, + [NumLanes, NumElemsPerLane](unsigned DstIdx, unsigned ShuffleMask) { + // DstIdx determines source. ShuffleMask selects lane in source. + unsigned BitsPerElem = NumLanes / 2; + unsigned IndexMask = (1u << BitsPerElem) - 1; + unsigned Lane = DstIdx / NumElemsPerLane; + unsigned SrcIdx = (Lane < NumLanes / 2) ? 0 : 1; + unsigned BitIdx = BitsPerElem * Lane; + unsigned SrcLaneIdx = (ShuffleMask >> BitIdx) & IndexMask; + unsigned ElemInLane = DstIdx % NumElemsPerLane; + unsigned IdxToPick = SrcLaneIdx * NumElemsPerLane + ElemInLane; + return std::pair{SrcIdx, IdxToPick}; + }); + } + case X86::BI__builtin_ia32_insertf32x4_256: case X86::BI__builtin_ia32_inserti32x4_256: case X86::BI__builtin_ia32_insertf64x2_256: diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 74f6e3acb6b39..999669f7539cd 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -50,6 +50,7 @@ #include "clang/AST/RecordLayout.h" #include "clang/AST/StmtVisitor.h" #include "clang/AST/Type.h" +#include "clang/AST/TypeBase.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/DiagnosticSema.h" @@ -13461,6 +13462,56 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case X86::BI__builtin_ia32_shuf_f32x4_256: + case X86::BI__builtin_ia32_shuf_i32x4_256: + case X86::BI__builtin_ia32_shuf_f64x2_256: + case X86::BI__builtin_ia32_shuf_i64x2_256: + case X86::BI__builtin_ia32_shuf_f32x4: + case X86::BI__builtin_ia32_shuf_i32x4: + case X86::BI__builtin_ia32_shuf_f64x2: + case X86::BI__builtin_ia32_shuf_i64x2: { + APValue SourceA, SourceB; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceA) || + !EvaluateAsRValue(Info, E->getArg(1), SourceB)) + return false; + + APSInt Imm; + if (!EvaluateInteger(E->getArg(2), Imm, Info)) + return false; + + // Destination and sources A, B all have the same type. + unsigned NumElems = SourceA.getVectorLength(); + const VectorType *VT = E->getArg(0)->getType()->castAs(); + QualType ElemQT = VT->getElementType(); + unsigned ElemBits = Info.Ctx.getTypeSize(ElemQT); + constexpr unsigned LaneBits = 128u; + unsigned NumLanes = (NumElems * ElemBits) / LaneBits; + unsigned NumElemsPerLane = LaneBits / ElemBits; + + unsigned DstLen = SourceA.getVectorLength(); + SmallVector ResultElements; + ResultElements.reserve(DstLen); + + APValue R; + if (!evalShuffleGeneric( + Info, E, R, + [NumLanes, NumElemsPerLane](unsigned DstIdx, unsigned ShuffleMask) + -> std::pair { + // DstIdx determines source. ShuffleMask selects lane in source. + unsigned BitsPerElem = NumLanes / 2; + unsigned IndexMask = (1u << BitsPerElem) - 1; + unsigned Lane = DstIdx / NumElemsPerLane; + unsigned SrcIdx = (Lane < NumLanes / 2) ? 0 : 1; + unsigned BitIdx = BitsPerElem * Lane; + unsigned SrcLaneIdx = (ShuffleMask >> BitIdx) & IndexMask; + unsigned ElemInLane = DstIdx % NumElemsPerLane; + unsigned IdxToPick = SrcLaneIdx * NumElemsPerLane + ElemInLane; + return {SrcIdx, IdxToPick}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_insertf32x4_256: case X86::BI__builtin_ia32_inserti32x4_256: case X86::BI__builtin_ia32_insertf64x2_256: diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index eb25aa538e9a3..4881700bb5a66 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -6747,6 +6747,7 @@ __m512 test_mm512_shuffle_f32x4(__m512 __A, __m512 __B) { // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> return _mm512_shuffle_f32x4(__A, __B, 4); } +TEST_CONSTEXPR(match_m512(_mm512_shuffle_f32x4(((__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), ((__m512){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0, 160.0}), 0b11111111), 13.0f, 14.0f, 15.0f, 16.0f, 13.0f, 14.0f, 15.0f, 16.0f, 130.0, 140.0, 150.0, 160.0, 130.0, 140.0, 150.0, 160.0)); __m512 test_mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_mask_shuffle_f32x4 @@ -6767,6 +6768,7 @@ __m512d test_mm512_shuffle_f64x2(__m512d __A, __m512d __B) { // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> %{{.*}}, <8 x i32> return _mm512_shuffle_f64x2(__A, __B, 4); } +TEST_CONSTEXPR(match_m512d(_mm512_shuffle_f64x2(((__m512d){1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0}), ((__m512d){10.0,20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 0b10101100), 1.0, 2.0, 7.0, 8.0, 50.0, 60.0, 50.0, 60.0)); __m512d test_mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_mask_shuffle_f64x2 @@ -6787,6 +6789,8 @@ __m512i test_mm512_shuffle_i32x4(__m512i __A, __m512i __B) { // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> return _mm512_shuffle_i32x4(__A, __B, 4); } +TEST_CONSTEXPR(match_v16si(_mm512_shuffle_i32x4(((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m512i)(__v16si){10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160}), 0), 1, 2, 3, 4, 1, 2, 3, 4, 10, 20, 30, 40, 10, 20, 30, 40)); + __m512i test_mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_shuffle_i32x4 @@ -6807,6 +6811,7 @@ __m512i test_mm512_shuffle_i64x2(__m512i __A, __m512i __B) { // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> return _mm512_shuffle_i64x2(__A, __B, 4); } +TEST_CONSTEXPR(match_m512i(_mm512_shuffle_i64x2(((__m512i){1, 2, 3, 4, 5, 6, 7, 8}), ((__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 0b11000110), 5, 6, 3, 4, 10, 20, 70, 80)); __m512i test_mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_mask_shuffle_i64x2 diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index e05b1ddf7b69a..aff775c7f7b3c 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -8989,6 +8989,7 @@ __m256 test_mm256_shuffle_f32x4(__m256 __A, __m256 __B) { // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> return _mm256_shuffle_f32x4(__A, __B, 3); } +TEST_CONSTEXPR(match_m256(_mm256_shuffle_f32x4(((__m256){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m256){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 1), 5.0, 6.0, 7.0, 8.0, 10.0, 20.0, 30.0, 40.0)); __m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_mask_shuffle_f32x4 @@ -9009,6 +9010,7 @@ __m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) { // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> return _mm256_shuffle_f64x2(__A, __B, 3); } +TEST_CONSTEXPR(match_m256d(_mm256_shuffle_f64x2(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){10.0, 20.0, 30.0, 40.0}), 3), 3.0, 4.0, 30.0, 40.0)); __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_mask_shuffle_f64x2 @@ -9031,6 +9033,7 @@ __m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) { // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> return _mm256_shuffle_i32x4(__A, __B, 3); } +TEST_CONSTEXPR(match_v8si(_mm256_shuffle_i32x4(((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}), ((__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80}), 0), 1, 2, 3, 4, 10, 20, 30, 40)); __m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_shuffle_i32x4 @@ -9051,6 +9054,7 @@ __m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) { // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> return _mm256_shuffle_i64x2(__A, __B, 3); } +TEST_CONSTEXPR(match_m256i(_mm256_shuffle_i64x2(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), ((__m256i){10ULL, 20ULL, 30ULL, 40ULL}), 2), 1ULL, 2ULL, 30ULL, 40ULL)); __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_mask_shuffle_i64x2 From 6dcf491b83a5ac6531f062dd31582c6f79e6ce58 Mon Sep 17 00:00:00 2001 From: Moritz Zielke Date: Tue, 25 Nov 2025 19:06:55 +0100 Subject: [PATCH 2/3] Add tests for mask/maskz intrinsics --- clang/test/CodeGen/X86/avx512f-builtins.c | 8 ++++++++ clang/test/CodeGen/X86/avx512vl-builtins.c | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index cf4bf2383eae8..753b4e48c20ed 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -6900,6 +6900,7 @@ __m512 test_mm512_mask_shuffle_f32x4(__m512 __W, __mmask16 __U, __m512 __A, __m5 // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_mask_shuffle_f32x4(__W, __U, __A, __B, 4); } +TEST_CONSTEXPR(match_m512(_mm512_mask_shuffle_f32x4(((__m512){100.0f, 200.0f, 300.0f, 400.0f, 500.0f, 600.0f, 700.0f, 800.0f, 900.0f, 1000.0f, 1100.0f, 1200.0f, 1300.0f, 1400.0f, 1500.0f, 1600.0f}), 0b1111111111111110, ((__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), ((__m512){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0, 160.0}), 0b11111111), 100.0f, 14.0f, 15.0f, 16.0f, 13.0f, 14.0f, 15.0f, 16.0f, 130.0, 140.0, 150.0, 160.0, 130.0, 140.0, 150.0, 160.0)); __m512 test_mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK-LABEL: test_mm512_maskz_shuffle_f32x4 @@ -6907,6 +6908,7 @@ __m512 test_mm512_maskz_shuffle_f32x4(__mmask16 __U, __m512 __A, __m512 __B) { // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} return _mm512_maskz_shuffle_f32x4(__U, __A, __B, 4); } +TEST_CONSTEXPR(match_m512(_mm512_maskz_shuffle_f32x4(0b1111111111110111, ((__m512){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), ((__m512){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0, 130.0, 140.0, 150.0, 160.0}), 0b11111111), 13.0f, 14.0f, 15.0f, 0.0f, 13.0f, 14.0f, 15.0f, 16.0f, 130.0, 140.0, 150.0, 160.0, 130.0, 140.0, 150.0, 160.0)); __m512d test_mm512_shuffle_f64x2(__m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_shuffle_f64x2 @@ -6921,6 +6923,7 @@ __m512d test_mm512_mask_shuffle_f64x2(__m512d __W, __mmask8 __U, __m512d __A, __ // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_mask_shuffle_f64x2(__W, __U, __A, __B, 4); } +TEST_CONSTEXPR(match_m512d(_mm512_mask_shuffle_f64x2(((__m512d){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}), 0b11110000, ((__m512d){1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0}), ((__m512d){10.0,20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 0b10101100), 100.0, 200.0, 300.0, 400.0, 50.0, 60.0, 50.0, 60.0)); __m512d test_mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK-LABEL: test_mm512_maskz_shuffle_f64x2 @@ -6928,6 +6931,7 @@ __m512d test_mm512_maskz_shuffle_f64x2(__mmask8 __U, __m512d __A, __m512d __B) { // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}} return _mm512_maskz_shuffle_f64x2(__U, __A, __B, 4); } +TEST_CONSTEXPR(match_m512d(_mm512_maskz_shuffle_f64x2(0b11110100, ((__m512d){1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0}), ((__m512d){10.0,20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 0b10101100), 0.0, 0.0, 7.0, 0.0, 50.0, 60.0, 50.0, 60.0)); __m512i test_mm512_shuffle_i32x4(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_shuffle_i32x4 @@ -6943,6 +6947,7 @@ __m512i test_mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, _ // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_mask_shuffle_i32x4(__W, __U, __A, __B, 4); } +TEST_CONSTEXPR(match_v16si(_mm512_mask_shuffle_i32x4(((__m512i)(__v16si){100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300, 1400, 1500, 1600}), 0b1111111111111011, ((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m512i)(__v16si){10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160}), 0), 1, 2, 300, 4, 1, 2, 3, 4, 10, 20, 30, 40, 10, 20, 30, 40)); __m512i test_mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_shuffle_i32x4 @@ -6950,6 +6955,7 @@ __m512i test_mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B) // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}} return _mm512_maskz_shuffle_i32x4(__U, __A, __B, 4); } +TEST_CONSTEXPR(match_v16si(_mm512_maskz_shuffle_i32x4(0b1011111111111111, ((__m512i)(__v16si){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), ((__m512i)(__v16si){10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160}), 0), 1, 2, 3, 4, 1, 2, 3, 4, 10, 20, 30, 40, 10, 20, 0, 40)); __m512i test_mm512_shuffle_i64x2(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_shuffle_i64x2 @@ -6964,6 +6970,7 @@ __m512i test_mm512_mask_shuffle_i64x2(__m512i __W, __mmask8 __U, __m512i __A, __ // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_mask_shuffle_i64x2(__W, __U, __A, __B, 4); } +TEST_CONSTEXPR(match_m512i(_mm512_mask_shuffle_i64x2(((__m512i){100, 200, 300, 400, 500, 600, 700, 800}), 0b11111101, ((__m512i){1, 2, 3, 4, 5, 6, 7, 8}), ((__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 0b11000110), 5, 200, 3, 4, 10, 20, 70, 80)); __m512i test_mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_maskz_shuffle_i64x2 @@ -6971,6 +6978,7 @@ __m512i test_mm512_maskz_shuffle_i64x2(__mmask8 __U, __m512i __A, __m512i __B) { // CHECK: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}} return _mm512_maskz_shuffle_i64x2(__U, __A, __B, 4); } +TEST_CONSTEXPR(match_m512i(_mm512_maskz_shuffle_i64x2(0b00111101, ((__m512i){1, 2, 3, 4, 5, 6, 7, 8}), ((__m512i){10, 20, 30, 40, 50, 60, 70, 80}), 0b11000110), 5, 0, 3, 4, 10, 20, 0, 0)); __m512d test_mm512_shuffle_pd(__m512d __M, __m512d __V) { // CHECK-LABEL: test_mm512_shuffle_pd diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 0206851f73ef6..e1878bd84b1ad 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -9086,6 +9086,7 @@ __m256 test_mm256_mask_shuffle_f32x4(__m256 __W, __mmask8 __U, __m256 __A, __m25 // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_mask_shuffle_f32x4(__W, __U, __A, __B, 3); } +TEST_CONSTEXPR(match_m256(_mm256_mask_shuffle_f32x4(((__m256){100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0, 800.0}), 0b10101010, ((__m256){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m256){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 1), 100.0, 6.0, 300.0, 8.0, 500.0, 20.0, 700.0, 40.0)); __m256 test_mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_shuffle_f32x4 @@ -9093,6 +9094,7 @@ __m256 test_mm256_maskz_shuffle_f32x4(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm256_maskz_shuffle_f32x4(__U, __A, __B, 3); } +TEST_CONSTEXPR(match_m256(_mm256_maskz_shuffle_f32x4(0b01010101, ((__m256){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m256){10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0}), 1), 5.0, 0.0, 7.0, 0.0, 10.0, 0.0, 30.0, 0.0)); __m256d test_mm256_shuffle_f64x2(__m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_shuffle_f64x2 @@ -9108,6 +9110,7 @@ __m256d test_mm256_mask_shuffle_f64x2(__m256d __W, __mmask8 __U, __m256d __A, __ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_mask_shuffle_f64x2(__W, __U, __A, __B, 3); } +TEST_CONSTEXPR(match_m256d(_mm256_mask_shuffle_f64x2(((__m256d){100.0, 200.0, 300.0, 400.0}), 0b00001111, ((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){10.0, 20.0, 30.0, 40.0}), 3), 3.0, 4.0, 30.0, 40.0)); __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK-LABEL: test_mm256_maskz_shuffle_f64x2 @@ -9116,6 +9119,7 @@ __m256d test_mm256_maskz_shuffle_f64x2(__mmask8 __U, __m256d __A, __m256d __B) { // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm256_maskz_shuffle_f64x2(__U, __A, __B, 3); } +TEST_CONSTEXPR(match_m256d(_mm256_maskz_shuffle_f64x2(0b00001011, ((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m256d){10.0, 20.0, 30.0, 40.0}), 3), 3.0, 4.0, 0.0, 40.0)); __m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_shuffle_i32x4 @@ -9130,6 +9134,7 @@ __m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_mask_shuffle_i32x4(__W, __U, __A, __B, 3); } +TEST_CONSTEXPR(match_v8si(_mm256_mask_shuffle_i32x4(((__m256i)(__v8si){100, 200, 300, 400, 500, 600, 700, 800}), 0b00000000, ((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}), ((__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80}), 0), 100, 200, 300, 400, 500, 600, 700, 800)); __m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_shuffle_i32x4 @@ -9137,6 +9142,7 @@ __m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) { // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm256_maskz_shuffle_i32x4(__U, __A, __B, 3); } +TEST_CONSTEXPR(match_v8si(_mm256_maskz_shuffle_i32x4(0b11111111, ((__m256i)(__v8si){1, 2, 3, 4, 5, 6, 7, 8}), ((__m256i)(__v8si){10, 20, 30, 40, 50, 60, 70, 80}), 0), 1, 2, 3, 4, 10, 20, 30, 40)); __m256i test_mm256_shuffle_i64x2(__m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_shuffle_i64x2 @@ -9152,6 +9158,7 @@ __m256i test_mm256_mask_shuffle_i64x2(__m256i __W, __mmask8 __U, __m256i __A, __ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_mask_shuffle_i64x2(__W, __U, __A, __B, 3); } +TEST_CONSTEXPR(match_m256i(_mm256_mask_shuffle_i64x2(((__m256i){100ULL, 200ULL, 300ULL, 400ULL}), 0b00001101, ((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), ((__m256i){10ULL, 20ULL, 30ULL, 40ULL}), 2), 1ULL, 200ULL, 30ULL, 40ULL)); __m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: test_mm256_maskz_shuffle_i64x2 @@ -9160,6 +9167,7 @@ __m256i test_mm256_maskz_shuffle_i64x2(__mmask8 __U, __m256i __A, __m256i __B) { // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm256_maskz_shuffle_i64x2(__U, __A, __B, 3); } +TEST_CONSTEXPR(match_m256i(_mm256_maskz_shuffle_i64x2( 0b00000110, ((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), ((__m256i){10ULL, 20ULL, 30ULL, 40ULL}), 2), 0ULL, 2ULL, 30ULL, 0ULL)); __m128d test_mm_mask_shuffle_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { // CHECK-LABEL: test_mm_mask_shuffle_pd From e886f30aa76eacdee02b4f3c0109b3aa103392d6 Mon Sep 17 00:00:00 2001 From: Moritz Zielke Date: Wed, 26 Nov 2025 17:37:40 +0100 Subject: [PATCH 3/3] Address review comments --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 2 +- clang/lib/AST/ExprConstant.cpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 0ceb5a7290f2f..659b6985d3157 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -4860,7 +4860,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, const auto *VecT = VecQT->castAs(); unsigned NumElems = VecT->getNumElements(); unsigned ElemBits = S.getASTContext().getTypeSize(VecT->getElementType()); - constexpr unsigned LaneBits = 128u; + unsigned LaneBits = 128u; unsigned NumLanes = (NumElems * ElemBits) / LaneBits; unsigned NumElemsPerLane = LaneBits / ElemBits; diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 3c8886aaaf832..88d7c79d3b99e 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -50,7 +50,6 @@ #include "clang/AST/RecordLayout.h" #include "clang/AST/StmtVisitor.h" #include "clang/AST/Type.h" -#include "clang/AST/TypeBase.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/DiagnosticSema.h" @@ -13540,7 +13539,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { const VectorType *VT = E->getArg(0)->getType()->castAs(); QualType ElemQT = VT->getElementType(); unsigned ElemBits = Info.Ctx.getTypeSize(ElemQT); - constexpr unsigned LaneBits = 128u; + unsigned LaneBits = 128u; unsigned NumLanes = (NumElems * ElemBits) / LaneBits; unsigned NumElemsPerLane = LaneBits / ElemBits;