From 53667e100dd379001ee56eeff2da5b127fb07535 Mon Sep 17 00:00:00 2001 From: seongjaep Date: Fri, 12 Sep 2025 14:18:41 +0900 Subject: [PATCH 01/21] [WIP][Clang][ConstExpr] Add initial support for AVX 256->128 extract builtins --- clang/lib/AST/ExprConstant.cpp | 31 +++++++++++++++++++ .../test/SemaCXX/constexpr-avx-intrinsics.cpp | 25 +++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 clang/test/SemaCXX/constexpr-avx-intrinsics.cpp diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 35a866ea5010f..4674381c34018 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12027,6 +12027,37 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + + case X86::BI__builtin_ia32_vextracti128_si256: + case X86::BI__builtin_ia32_vextractf128_pd: + case X86::BI__builtin_ia32_vextractf128_ps: + case X86::BI__builtin_ia32_vextractf128_si256: { + APValue SourceHi, SourceLo, SourceAmt; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceHi) || + !EvaluateAsRValue(Info, E->getArg(1), SourceLo) || + !EvaluateAsRValue(Info, E->getArg(2), SourceAmt)) + return false; + + QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType(); + unsigned SourceLen = SourceHi.getVectorLength(); + SmallVector<APValue> ResultElements; + ResultElements.reserve(SourceLen); + + APInt Amt = SourceAmt.getInt(); + for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) { + APInt Hi = SourceHi.getVectorElt(EltNum).getInt(); + APInt Lo = SourceLo.getVectorElt(EltNum).getInt(); + APInt R = llvm::APIntOps::fshl(Hi, Lo, Amt); + ResultElements.push_back( + APValue(APSInt(R, DestEltTy->isUnsignedIntegerOrEnumerationType()))); + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + + + + case X86::BI__builtin_ia32_vpshldd128: case X86::BI__builtin_ia32_vpshldd256: case X86::BI__builtin_ia32_vpshldd512: diff --git a/clang/test/SemaCXX/constexpr-avx-intrinsics.cpp b/clang/test/SemaCXX/constexpr-avx-intrinsics.cpp new file mode 100644 index 0000000000000..30e1340601255 --- /dev/null +++ b/clang/test/SemaCXX/constexpr-avx-intrinsics.cpp @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s +// expected-no-diagnostics + +#include <immintrin.h> // AVX/AVX512 header + +// // A constexpr function that uses the AVX/AVX2 intrinsic under test +// constexpr int test_avx_subvector_extraction() { +// __m256i a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + +// // Core of the issue: this intrinsic call must be usable in a constexpr context +// __m128i sub = _mm256_extracti128_si256(a, 0); + +// return _mm_cvtsi128_si32(sub); // convert the result to int and return it +// } + +// // This constant must be evaluated at compile time +// constexpr int result = test_avx_subvector_extraction(); + +// static_assert(result == 0, "Incorrect result"); + +#include <immintrin.h> + +constexpr __m128 test(__m256 a) { + return _mm256_extractf128_ps(a, 1); } \ No newline at end of file From 46458a47192c7ef899336a6a175276b644ac34f8 Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Fri, 12 Sep 2025 21:00:28 +0900 Subject: [PATCH 02/21] [clang] Support constexpr evaluation for AVX/AVX2 extract intrinsics Implements constexpr evaluation for: - _mm256_extracti128_si256 (AVX2, VEXTRACTI128) - _mm256_extractf128_ps - _mm256_extractf128_pd - _mm256_extractf128_si256 These now work correctly in constant expressions by extracting the appropriate 128-bit lane from a 256-bit vector.
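As an illustration only (not part of the diff below; assumes <immintrin.h> and an AVX-enabled target), this is the kind of code the change is meant to allow in a constant expression:

    constexpr __m256 v = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f};
    constexpr __m128 hi = _mm256_extractf128_ps(v, 1); // upper 128-bit lane
    static_assert(hi[0] == 4.0f && hi[3] == 7.0f, "elements 4..7 are extracted");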
--- clang/lib/AST/ExprConstant.cpp | 43 +++++++++++++++------------------- 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 4674381c34018..16567e56cc778 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12028,35 +12028,30 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } - case X86::BI__builtin_ia32_vextracti128_si256: - case X86::BI__builtin_ia32_vextractf128_pd: - case X86::BI__builtin_ia32_vextractf128_ps: + case X86::BI__builtin_ia32_extract128i256: + case X86::BI__builtin_ia32_vextractf128_pd256: + case X86::BI__builtin_ia32_vextractf128_ps256: case X86::BI__builtin_ia32_vextractf128_si256: { - APValue SourceHi, SourceLo, SourceAmt; - if (!EvaluateAsRValue(Info, E->getArg(0), SourceHi) || - !EvaluateAsRValue(Info, E->getArg(1), SourceLo) || - !EvaluateAsRValue(Info, E->getArg(2), SourceAmt)) + APValue SourceVec, SourceImm; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) || + !EvaluateAsRValue(Info, E->getArg(1), SourceImm)) return false; - QualType DestEltTy = E->getType()->castAs()->getElementType(); - unsigned SourceLen = SourceHi.getVectorLength(); - SmallVector ResultElements; - ResultElements.reserve(SourceLen); - - APInt Amt = SourceAmt.getInt(); - for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) { - APInt Hi = SourceHi.getVectorElt(EltNum).getInt(); - APInt Lo = SourceLo.getVectorElt(EltNum).getInt(); - APInt R = llvm::APIntOps::fshl(Hi, Lo, Amt); - ResultElements.push_back( - APValue(APSInt(R, DestEltTy->isUnsignedIntegerOrEnumerationType()))); - } + unsigned idx = SourceImm.getInt().getZExtValue() & 1; + const auto *RetVT = E->getType()->castAs(); + unsigned RetLen = RetVT->getNumElements(); + unsigned SrcLen = SourceVec.getVectorLength(); + if (SrcLen != RetLen * 2) + return false; + + SmallVector ResultElements; + ResultElements.reserve(RetLen); - return Success(APValue(ResultElements.data(), ResultElements.size()), E); + for (unsigned i = 0; i < RetLen; i++) + ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i)); + + return Success(APValue(ResultElements.data(), RetLen), E); } - - - case X86::BI__builtin_ia32_vpshldd128: case X86::BI__builtin_ia32_vpshldd256: From cc5b2938aeb828dbd58f84f0f5a1dc3dca2a4095 Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Tue, 16 Sep 2025 18:14:57 +0900 Subject: [PATCH 03/21] [clang] Implement constant evaluation for AVX extract intrinsics (part) --- clang/lib/AST/ExprConstant.cpp | 111 ++++++++++++++++++++++++++++++++- 1 file changed, 109 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 16567e56cc778..9598d72416f6e 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12028,7 +12028,114 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } - case X86::BI__builtin_ia32_extract128i256: + case X86::BI__builtin_ia32_extracti32x4_256_mask: // _mm256_extracti32x4_epi32 + case X86::BI__builtin_ia32_extracti32x4_mask: // _mm512_extracti32x4_epi32 + case X86::BI__builtin_ia32_extracti32x8_mask: // _mm512_extracti32x8_epi32 + case X86::BI__builtin_ia32_extracti64x2_256_mask: // _mm256_extracti64x2_epi64 + case X86::BI__builtin_ia32_extracti64x2_512_mask: // _mm512_extracti64x2_epi64 + case X86::BI__builtin_ia32_extracti64x4_mask: { // 
_mm512_extracti64x4_epi64 + APValue SourceVec, SourceImm, SourceMerge, SourceKmask; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) || + !EvaluateAsRValue(Info, E->getArg(1), SourceImm) || + !EvaluateAsRValue(Info, E->getArg(2), SourceMerge) || + !EvaluateAsRValue(Info, E->getArg(3), SourceKmask)) + return false; + + const auto *RetVT = E->getType()->castAs(); + QualType EltTy = RetVT->getElementType(); + unsigned RetLen = RetVT->getNumElements(); + + if (!SourceVec.isVector()) + return false; + unsigned SrcLen = SourceVec.getVectorLength(); + if (SrcLen % RetLen != 0) + return false; + + unsigned NumLanes = SrcLen / RetLen; + unsigned idx = SourceImm.getInt().getZExtValue() & (NumLanes - 1); + + // Step 2) Apply kmask (covers plain/mask/maskz): + // - plain : headers pass kmask=all-ones; merge is undef → always take Extracted. + // - mask : merge=dst; take? Extracted[i] : dst[i] + // - maskz : merge=zero; take? Extracted[i] : 0 + uint64_t KmaskBits = SourceKmask.getInt().getZExtValue(); + + auto makeZeroInt = [&]() -> APValue { + bool Uns = EltTy->isUnsignedIntegerOrEnumerationType(); + unsigned BW = Info.Ctx.getIntWidth(EltTy); + return APValue(APSInt(APInt(BW, 0), Uns)); + }; + + SmallVector ResultElements; + ResultElements.reserve(RetLen); + for (unsigned i = 0; i < RetLen; i++) { + bool Take = (KmaskBits >> i) & 1; + if (Take) { + ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i)); + } else { + // For plain (all-ones) this path is never taken. + // For mask : merge is the original dst element. + // For maskz : headers pass zero vector as merge. + const APValue &MergeElt = + SourceMerge.isVector() ? SourceMerge.getVectorElt(i) : makeZeroInt(); + ResultElements.push_back(MergeElt); + } + } + return Success(APValue(ResultElements.data(), RetLen), E); + } + + case X86::BI__builtin_ia32_extractf32x4_256_mask: // _mm256_extractf32x4_ps _mm256_mask_extractf32x4_ps _mm256_maskz_extractf32x4_ps + case X86::BI__builtin_ia32_extractf32x4_mask: // _mm512_extractf32x4_ps _mm512_mask_extractf32x4_ps _mm512_maskz_extractf32x4_ps + case X86::BI__builtin_ia32_extractf32x8_mask: // _mm512_extractf32x8_ps _mm512_mask_extractf32x8_ps _mm512_maskz_extractf32x8_ps + + case X86::BI__builtin_ia32_extractf64x2_256_mask: // _mm256_extractf64x2_pd _mm256_mask_extractf64x2_pd _mm256_maskz_extractf64x2_pd + case X86::BI__builtin_ia32_extractf64x2_512_mask: // _mm512_extractf64x2_pd _mm512_mask_extractf64x2_pd _mm512_maskz_extractf64x2_pd + case X86::BI__builtin_ia32_extractf64x4_mask: { // _mm512_extractf64x4_pd _mm512_mask_extractf64x4_pd _mm512_maskz_extractf64x4_pd + APValue SourceVec, SourceImm, SourceMerge, SourceKmask; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) || + !EvaluateAsRValue(Info, E->getArg(1), SourceImm) || + !EvaluateAsRValue(Info, E->getArg(2), SourceMerge) || + !EvaluateAsRValue(Info, E->getArg(3), SourceKmask)) + return false; + + const auto *RetVT = E->getType()->castAs(); + QualType EltTy = RetVT->getElementType(); + unsigned RetLen = RetVT->getNumElements(); + + if (!SourceVec.isVector()) + return false; + unsigned SrcLen = SourceVec.getVectorLength(); + if (SrcLen % RetLen != 0) + return false; + + unsigned NumLanes = SrcLen / RetLen; + unsigned idx = SourceImm.getInt().getZExtValue() & (NumLanes - 1); + + uint64_t KmaskBits = SourceKmask.getInt().getZExtValue(); + + auto makeZeroFP = [&]() -> APValue { + const llvm::fltSemantics &Sem = + Info.Ctx.getFloatTypeSemantics(EltTy); + return APValue(llvm::APFloat::getZero(Sem)); + }; + + SmallVector 
ResultElements; + ResultElements.reserve(RetLen); + for (unsigned i = 0; i < RetLen; i++) { + bool Take = (KmaskBits >> i) & 1; + if (Take) { + ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i)); + } else { + const APValue &MergeElt = + SourceMerge.isVector() ? SourceMerge.getVectorElt(i) : makeZeroInt(); + ResultElements.push_back(MergeElt); + } + } + return Success(APValue(ResultElements.data(), RetLen), E); + } + + // vector extract + case X86::BI__builtin_ia32_extract128i256: // avx2 case X86::BI__builtin_ia32_vextractf128_pd256: case X86::BI__builtin_ia32_vextractf128_ps256: case X86::BI__builtin_ia32_vextractf128_si256: { @@ -12044,7 +12151,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (SrcLen != RetLen * 2) return false; - SmallVector ResultElements; + SmallVector ResultElements; ResultElements.reserve(RetLen); for (unsigned i = 0; i < RetLen; i++) From 47f4ad54385644200d9ac6ca0a522b85aa1803b0 Mon Sep 17 00:00:00 2001 From: Yuriy Chernyshov Date: Mon, 22 Sep 2025 15:58:27 +0300 Subject: [PATCH 04/21] Add missing #include (#157840) std::realloc is declared there From e2f3ed27890f390e9ad7fb381af9ae43f09e300c Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Tue, 23 Sep 2025 15:26:23 +0900 Subject: [PATCH 05/21] WIP: in-progress changes --- clang/lib/AST/ExprConstant.cpp | 61 +++++++++++-------- .../test/SemaCXX/constexpr-avx-intrinsics.cpp | 25 -------- 2 files changed, 34 insertions(+), 52 deletions(-) delete mode 100644 clang/test/SemaCXX/constexpr-avx-intrinsics.cpp diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 9598d72416f6e..27728b64aa84b 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12028,6 +12028,39 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + // vector extract + case X86::BI__builtin_ia32_extract128i256: + case X86::BI__builtin_ia32_vextractf128_pd256: + case X86::BI__builtin_ia32_vextractf128_ps256: + case X86::BI__builtin_ia32_vextractf128_si256: { + APValue SourceVec, SourceImm; + if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) || + !EvaluateAsRValue(Info, E->getArg(1), SourceImm)) + return false; + + if (!SourceVec.isVector()) + return false; + + const auto *RetVT = E->getType()->castAs(); + if (!RetVT) return false; + + unsigned RetLen = RetVT->getNumElements(); + unsigned SrcLen = SourceVec.getVectorLength(); + if (SrcLen != RetLen * 2) + return false; + + unsigned idx = SourceImm.getInt().getZExtValue() & 1; + + SmallVector ResultElements; + ResultElements.reserve(RetLen); + + for (unsigned i = 0; i < RetLen; i++) + ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i)); + + return Success(APValue(ResultElements.data(), RetLen), E); + } + + // masked extract (ex: mm512_mask_extract32x4_epi32 / 512 -> 128) case X86::BI__builtin_ia32_extracti32x4_256_mask: // _mm256_extracti32x4_epi32 case X86::BI__builtin_ia32_extracti32x4_mask: // _mm512_extracti32x4_epi32 case X86::BI__builtin_ia32_extracti32x8_mask: // _mm512_extracti32x8_epi32 @@ -12127,39 +12160,13 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i)); } else { const APValue &MergeElt = - SourceMerge.isVector() ? SourceMerge.getVectorElt(i) : makeZeroInt(); + SourceMerge.isVector() ? 
SourceMerge.getVectorElt(i) : makeZeroFP(); ResultElements.push_back(MergeElt); } } return Success(APValue(ResultElements.data(), RetLen), E); } - // vector extract - case X86::BI__builtin_ia32_extract128i256: // avx2 - case X86::BI__builtin_ia32_vextractf128_pd256: - case X86::BI__builtin_ia32_vextractf128_ps256: - case X86::BI__builtin_ia32_vextractf128_si256: { - APValue SourceVec, SourceImm; - if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) || - !EvaluateAsRValue(Info, E->getArg(1), SourceImm)) - return false; - - unsigned idx = SourceImm.getInt().getZExtValue() & 1; - const auto *RetVT = E->getType()->castAs(); - unsigned RetLen = RetVT->getNumElements(); - unsigned SrcLen = SourceVec.getVectorLength(); - if (SrcLen != RetLen * 2) - return false; - - SmallVector ResultElements; - ResultElements.reserve(RetLen); - - for (unsigned i = 0; i < RetLen; i++) - ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i)); - - return Success(APValue(ResultElements.data(), RetLen), E); - } - case X86::BI__builtin_ia32_vpshldd128: case X86::BI__builtin_ia32_vpshldd256: case X86::BI__builtin_ia32_vpshldd512: diff --git a/clang/test/SemaCXX/constexpr-avx-intrinsics.cpp b/clang/test/SemaCXX/constexpr-avx-intrinsics.cpp deleted file mode 100644 index 30e1340601255..0000000000000 --- a/clang/test/SemaCXX/constexpr-avx-intrinsics.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s -// expected-no-diagnostics - -#include // AVX/AVX512 헤더 - -// // 테스트하려는 AVX/AVX512 내장 함수를 사용하는 constexpr 함수 -// constexpr int test_avx_subvector_extraction() { -// __m256i a = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - -// // 이슈의 핵심: 이 내장 함수 호출이 constexpr 문맥에서 가능해야 함 -// __m128i sub = _mm256_extracti128_si256(a, 0); - -// return _mm_cvtsi128_si32(sub); // 결과를 int로 변환하여 리턴 -// } - -// // 이 상수는 컴파일 시간에 평가되어야 함 -// constexpr int result = test_avx_subvector_extraction(); - -// static_assert(result == 0, "Incorrect result"); - -#include - -constexpr __m128 test(__m256 a) { - return _mm256_extractf128_ps(a, 1); -} \ No newline at end of file From 16db57d65106c345c60ed963cf7a4e276b1e17ec Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Wed, 24 Sep 2025 02:09:42 +0900 Subject: [PATCH 06/21] [clang][ByteCode] constexpr-enable X86 AVX/AVX512 subvector extract builtins in InterpBuiltin - Route AVX/AVX2 vextractf128/ extract128i256 to 2-arg extract helper. - Route all AVX-512(VL/DQ) extract builtins to unified 4-arg masked helper: * extractf32x4_{256,_} * extractf32x8_ * extractf64x2_{256,512} * extractf64x4_ * extracti32x4_{256,_} * extracti32x8_ * extracti64x2_{256,512} * extracti64x4_ - Implement mask/merge/all-ones(mask=plain)/maskz semantics. - Initialize all elements in the destination vector. NOTE: Tests are not included yet. This patch wires up InterpBuiltin support only. A follow-up patch will add constexpr tests under clang/test/AST/Interp/. 
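As a rough sketch of the intended per-element behavior (mirroring the Intel pseudocode; the names below are illustrative, not actual helper code from this patch):

    // The lane is selected by the immediate, then each element is masked:
    for (unsigned I = 0; I != DstElems; ++I)
      dst[I] = ((kmask >> I) & 1) ? src[Lane * DstElems + I]
                                  : (IsMaskZ ? 0 : merge[I]);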
--- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 173 +++++++++++++++++++++++ 1 file changed, 173 insertions(+) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 922d67940e22f..c4040158ca440 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -712,6 +712,36 @@ static bool interp__builtin_expect(InterpState &S, CodePtr OpPC, return true; } + +/// rotateleft(value, amount) +static bool interp__builtin_rotate(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const CallExpr *Call, bool Right) { + APSInt Amount = popToAPSInt(S, Call->getArg(1)); + APSInt Value = popToAPSInt(S, Call->getArg(0)); + + APSInt Result; + if (Right) + Result = APSInt(Value.rotr(Amount.urem(Value.getBitWidth())), + /*IsUnsigned=*/true); + else // Left. + Result = APSInt(Value.rotl(Amount.urem(Value.getBitWidth())), + /*IsUnsigned=*/true); + + pushInteger(S, Result, Call->getType()); + return true; +} + +static bool interp__builtin_ffs(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const CallExpr *Call) { + APSInt Value = popToAPSInt(S, Call->getArg(0)); + + uint64_t N = Value.countr_zero(); + pushInteger(S, N == Value.getBitWidth() ? 0 : N + 1, Call->getType()); + return true; +} + static bool interp__builtin_addressof(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { @@ -2819,6 +2849,127 @@ static bool interp__builtin_elementwise_triop( return true; } +//_builtin_extract +static bool interp__builtin_x86_extract_vector(InterpState &S, CodePtr OpPC, + const CallExpr *Call, + unsigned ID) { + assert(Call->getNumArgs() == 2); + + // srcimm + APSInt ImmAPS = popToAPSInt(S, Call->getArg(1)); + uint64_t Index = ImmAPS.getZExtValue(); + + // srcvec + const Pointer &Src = S.Stk.pop(); + if (!Src.getFieldDesc()->isPrimitiveArray()) + return false; + + // destination (return value) + const Pointer &Dst = S.Stk.peek(); + if (!Dst.getFieldDesc()->isPrimitiveArray()) + return false; + + unsigned SrcElems = Src.getNumElems(); + unsigned DstElems = Dst.getNumElems(); + + if (SrcElems == 0 || DstElems == 0 || (SrcElems % DstElems) != 0) + return false; + + unsigned NumLanes = SrcElems / DstElems; + unsigned Lane = static_cast(Index % NumLanes); + unsigned ExtractPos = Lane * DstElems; + + // element type + PrimType ElemPT = Src.getFieldDesc()->getPrimType(); + if (ElemPT != Dst.getFieldDesc()->getPrimType()) + return false; + + TYPE_SWITCH(ElemPT, { + for (unsigned I = 0; I != DstElems; ++I) { + Dst.elem(I) = Src.elem(ExtractPos + I); + } + }); + + Dst.initializeAllElements(); + return true; +} + +// __builtin_extract_masked +static bool interp__builtin_x86_extract_vector_masked(InterpState &S, CodePtr OpPC, + const CallExpr *Call, + unsigned ID) { + assert(Call->getNumArgs() == 4); + + // kmask + APSInt KmaskAPS = popToAPSInt(S, Call->getArg(3)); + uint64_t Kmask = KmaskAPS.getZExtValue(); + + // merge + const Pointer &Merge = S.Stk.pop(); + bool HasMergeVec = Merge.isLive() && Merge.getFieldDesc() && + Merge.getFieldDesc()->isPrimitiveArray(); + + // srcimm + APSInt ImmAPS = popToAPSInt(S, Call->getArg(1)); + uint64_t Index = ImmAPS.getZExtValue(); + + // srcvec + const Pointer &Src = S.Stk.pop(); + if (!Src.getFieldDesc()->isPrimitiveArray()) + return false; + + // dst (return) + const Pointer &Dst = S.Stk.peek(); + if (!Dst.getFieldDesc()->isPrimitiveArray()) + return false; + + unsigned SrcElems = Src.getNumElems(); + unsigned DstElems = Dst.getNumElems(); + if (SrcElems == 0 
|| DstElems == 0 || (SrcElems % DstElems) != 0) + return false; + + unsigned NumLanes = SrcElems / DstElems; + unsigned Lane = static_cast(Index % NumLanes); + unsigned ExtractPos = Lane * DstElems; + + PrimType ElemPT = Src.getFieldDesc()->getPrimType(); + if (ElemPT != Dst.getFieldDesc()->getPrimType()) + return false; + + // Merge vector type/len check(if) + if (HasMergeVec) { + if (Merge.getFieldDesc()->getPrimType() != ElemPT || + Merge.getNumElems() != DstElems) + return false; + } + + // generate 0 value + auto storeZeroAt = [&](unsigned I) { + TYPE_SWITCH(ElemPT, { + Dst.elem(I) = T{}; + }); + }; + + TYPE_SWITCH(ElemPT, { + for (unsigned I = 0; I != DstElems; ++I) { + bool Take = ((Kmask >> I) & 1) != 0; + if (Take) { + Dst.elem(I) = Src.elem(ExtractPos + I); + } else { + if (HasMergeVec) { + Dst.elem(I) = Merge.elem(I); + } else { + storeZeroAt(I); + } + } + } + }); + + Dst.initializeAllElements(); + return true; +} + + static bool interp__builtin_x86_insert_subvector(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned ID) { @@ -3452,6 +3603,28 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS); }); + case X86::BI__builtin_ia32_extract128i256: // _mm256_extracti128 + case X86::BI__builtin_ia32_vextractf128_pd256: // _mm256_extractf128_ps + case X86::BI__builtin_ia32_vextractf128_ps256: // _mm256_extractf128_pd + case X86::BI__builtin_ia32_vextractf128_si256: // _mm256_extracti128_si256 + return interp__builtin_x86_extract_vector(S, OpPC, Call, BuiltinID); + + // AVX-512 / AVX-512VL / AVX-512DQ + case X86::BI__builtin_ia32_extractf32x4_256_mask: + case X86::BI__builtin_ia32_extractf32x4_mask: + case X86::BI__builtin_ia32_extractf32x8_mask: + case X86::BI__builtin_ia32_extractf64x2_256_mask: + case X86::BI__builtin_ia32_extractf64x2_512_mask: + case X86::BI__builtin_ia32_extractf64x4_mask: + case X86::BI__builtin_ia32_extracti32x4_256_mask: + case X86::BI__builtin_ia32_extracti32x4_mask: + case X86::BI__builtin_ia32_extracti32x8_mask: + case X86::BI__builtin_ia32_extracti64x2_256_mask: + case X86::BI__builtin_ia32_extracti64x2_512_mask: + case X86::BI__builtin_ia32_extracti64x4_mask: + return interp__builtin_x86_extract_vector_masked(S, OpPC, Call, BuiltinID); + + case clang::X86::BI__builtin_ia32_pavgb128: case clang::X86::BI__builtin_ia32_pavgw128: case clang::X86::BI__builtin_ia32_pavgb256: From 558e23804ab97873885dc72e61deec39280880a4 Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Wed, 24 Sep 2025 21:16:11 +0900 Subject: [PATCH 07/21] Remove commented code --- clang/lib/AST/ExprConstant.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 27728b64aa84b..7d2b341fe9ce7 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12029,7 +12029,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { } // vector extract - case X86::BI__builtin_ia32_extract128i256: + case X86::BI__builtin_ia32_extract128i256: // avx2 case X86::BI__builtin_ia32_vextractf128_pd256: case X86::BI__builtin_ia32_vextractf128_ps256: case X86::BI__builtin_ia32_vextractf128_si256: { @@ -12060,12 +12060,11 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), RetLen), E); } - // masked extract (ex: mm512_mask_extract32x4_epi32 / 512 -> 128) case X86::BI__builtin_ia32_extracti32x4_256_mask: // _mm256_extracti32x4_epi32 case 
X86::BI__builtin_ia32_extracti32x4_mask: // _mm512_extracti32x4_epi32 case X86::BI__builtin_ia32_extracti32x8_mask: // _mm512_extracti32x8_epi32 case X86::BI__builtin_ia32_extracti64x2_256_mask: // _mm256_extracti64x2_epi64 - case X86::BI__builtin_ia32_extracti64x2_512_mask: // _mm512_extracti64x2_epi64 + case X86::BI__builtin_ia32_extracti64x2_512_mask: // _mm512_extracti64x2_epi64 case X86::BI__builtin_ia32_extracti64x4_mask: { // _mm512_extracti64x4_epi64 APValue SourceVec, SourceImm, SourceMerge, SourceKmask; if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) || From 315f686d5c59599a2b8e285e7d14ad52ac84469f Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Wed, 24 Sep 2025 21:20:45 +0900 Subject: [PATCH 08/21] Add constexpr tests for AVX/AVX2/AVX-512 extract intrinsics --- clang/test/CodeGen/X86/avx-builtins.c | 9 +++ clang/test/CodeGen/X86/avx2-builtins.c | 2 + clang/test/CodeGen/X86/avx512dq-builtins.c | 68 ++++++++++++++++++++ clang/test/CodeGen/X86/avx512f-builtins.c | 67 +++++++++++++++++++ clang/test/CodeGen/X86/avx512vl-builtins.c | 30 +++++++++ clang/test/CodeGen/X86/avx512vldq-builtins.c | 26 ++++++++ 6 files changed, 202 insertions(+) diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 5f08b6be81ab7..5aa69c75aea28 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -1071,18 +1071,27 @@ __m128d test_mm256_extractf128_pd(__m256d A) { return _mm256_extractf128_pd(A, 1); } +TEST_CONSTEXPR(match_m128d(_mm256_extractf128_pd(((__m256d){0.0, 1.0, 2.0, 3.0}), 1), + 2.0, 3.0)); + __m128 test_mm256_extractf128_ps(__m256 A) { // CHECK-LABEL: test_mm256_extractf128_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> return _mm256_extractf128_ps(A, 1); } +TEST_CONSTEXPR(match_m128(_mm256_extractf128_ps(((__m256){0,1,2,3,4,5,6,7}), 1), + 4.0f, 5.0f, 6.0f, 7.0f)); + __m128i test_mm256_extractf128_si256(__m256i A) { // CHECK-LABEL: test_mm256_extractf128_si256 // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <4 x i32> return _mm256_extractf128_si256(A, 1); } +TEST_CONSTEXPR(match_m128i(_mm256_extractf128_si256(((__m256i){0,1,2,3,4,5,6,7}), 1), + 4, 5, 6, 7)); + __m256d test_mm256_floor_pd(__m256d x) { // CHECK-LABEL: test_mm256_floor_pd // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 1) diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index 55f18f947b96f..c04d50c893c21 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -479,6 +479,8 @@ __m128i test2_mm256_extracti128_si256(__m256i a) { // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> return _mm256_extracti128_si256(a, 0); } +TEST_CONSTEXPR(match_m128i(_mm256_extracti128_si256(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), 0), + 1ULL, 2ULL)); __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hadd_epi16 diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index 4112561216af8..08013705875d0 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -1402,6 +1402,11 @@ __m256 test_mm512_extractf32x8_ps(__m512 __A) { // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <8 x i32> return _mm512_extractf32x8_ps(__A, 1); } +TEST_CONSTEXPR(match_m256(_mm512_extractf32x8_ps(((__m512){ + 0.0f,1.0f,2.0f,3.0f, 4.0f,5.0f,6.0f,7.0f, + 
8.0f,9.0f,10.0f,11.0f, 12.0f,13.0f,14.0f,15.0f + }), 1), + 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f)); __m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) { // CHECK-LABEL: test_mm512_mask_extractf32x8_ps @@ -1409,6 +1414,15 @@ __m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm512_mask_extractf32x8_ps(__W, __U, __A, 1); } +TEST_CONSTEXPR(match_m256(_mm512_mask_extractf32x8_ps( + (__m256){0,0,0,0,0,0,0,0}, // W + ((__mmask8)0xFF), // U = all ones (plain) + (__m512){ + 0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f, + 8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f + }, + 1), + 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f)); __m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) { // CHECK-LABEL: test_mm512_maskz_extractf32x8_ps @@ -1416,12 +1430,24 @@ __m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} return _mm512_maskz_extractf32x8_ps(__U, __A, 1); } +TEST_CONSTEXPR(match_m256(_mm512_maskz_extractf32x8_ps( + ((__mmask8)0x0F), + (__m512){ + 0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f, + 8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f + }, + 1), + 8.0f, 9.0f, 10.0f, 11.0f, 0.0f, 0.0f, 0.0f, 0.0f)); __m128d test_mm512_extractf64x2_pd(__m512d __A) { // CHECK-LABEL: test_mm512_extractf64x2_pd // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <2 x i32> return _mm512_extractf64x2_pd(__A, 3); } +TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd(((__m512d){ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 + }), 3), + 6.0, 7.0)); __m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) { // CHECK-LABEL: test_mm512_mask_extractf64x2_pd @@ -1429,6 +1455,12 @@ __m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm512_mask_extractf64x2_pd(__W, __U, __A, 3); } +TEST_CONSTEXPR(match_m128d(_mm512_mask_extractf64x2_pd( + (__m128d){100.0, 101.0}, // W(merge) + (__mmask8)0x1, // 0000 0001b + (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}, + 3), + 6.0, 101.0)); __m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) { // CHECK-LABEL: test_mm512_maskz_extractf64x2_pd @@ -1436,12 +1468,21 @@ __m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) { // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm512_maskz_extractf64x2_pd(__U, __A, 3); } +TEST_CONSTEXPR(match_m128d(_mm512_maskz_extractf64x2_pd( + (__mmask8)0x2, // 0000 0010b + (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}, + 3), + 0.0, 7.0)); __m256i test_mm512_extracti32x8_epi32(__m512i __A) { // CHECK-LABEL: test_mm512_extracti32x8_epi32 // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <8 x i32> return _mm512_extracti32x8_epi32(__A, 1); } +TEST_CONSTEXPR(match_m256i(_mm512_extracti32x8_epi32(((__m512i){ + 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 + }), 1), + 8, 9,10,11,12,13,14,15)); __m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_extracti32x8_epi32 @@ -1449,6 +1490,13 @@ __m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm512_mask_extracti32x8_epi32(__W, __U, __A, 1); } 
+TEST_CONSTEXPR(match_m256i(_mm512_mask_extracti32x8_epi32( + (__m256i){100,101,102,103,104,105,106,107}, // W(merge) + (__mmask8)0xAA, // 1010 1010b → take only the odd lanes + (__m512i){ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }, + 1), + // lane0..7: 8,9,10,11,12,13,14,15 + 100, 9, 102, 11, 104, 13, 106, 15)); __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_extracti32x8_epi32 @@ -1456,12 +1504,21 @@ __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm512_maskz_extracti32x8_epi32(__U, __A, 1); } +TEST_CONSTEXPR(match_m256i(_mm512_maskz_extracti32x8_epi32( + (__mmask8)0x0F, + (__m512i){ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }, + 1), + 8, 9, 10, 11, 0, 0, 0, 0)); __m128i test_mm512_extracti64x2_epi64(__m512i __A) { // CHECK-LABEL: test_mm512_extracti64x2_epi64 // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <2 x i32> return _mm512_extracti64x2_epi64(__A, 3); } +TEST_CONSTEXPR(match_m128i_64(_mm512_extracti64x2_epi64(((__m512i){ + 0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL + }), 3), + 6ULL, 7ULL)); __m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_extracti64x2_epi64 @@ -1469,6 +1526,12 @@ __m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm512_mask_extracti64x2_epi64(__W, __U, __A, 3); } +TEST_CONSTEXPR(match_m128i_64(_mm512_mask_extracti64x2_epi64( + (__m128i){100ULL, 101ULL}, // W(merge) + (__mmask8)0x1, // take lane0 only + (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}, + 3), + 6ULL, 101ULL)); __m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_extracti64x2_epi64 @@ -1476,6 +1539,11 @@ __m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) { // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm512_maskz_extracti64x2_epi64(__U, __A, 3); } +TEST_CONSTEXPR(match_m128i_64(_mm512_maskz_extracti64x2_epi64( + (__mmask8)0x2, // take lane1, zero lane0 + (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}, + 3), + 0ULL, 7ULL)); __m512 test_mm512_insertf32x8(__m512 __A, __m256 __B) { // CHECK-LABEL: test_mm512_insertf32x8 diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 7756f0da18c03..d37b22285174e 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -2452,6 +2452,11 @@ __m256d test_mm512_extractf64x4_pd(__m512d a) // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> return _mm512_extractf64x4_pd(a, 1); } +TEST_CONSTEXPR(match_m256d(_mm512_extractf64x4_pd(((__m512d){ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 + }), 1), + 4.0, 5.0, 6.0, 7.0)); + __m256d test_mm512_mask_extractf64x4_pd(__m256d __W,__mmask8 __U,__m512d __A){ // CHECK-LABEL: test_mm512_mask_extractf64x4_pd @@ -2459,6 +2464,12 @@ __m256d test_mm512_mask_extractf64x4_pd(__m256d __W,__mmask8 __U,__m512d __A){ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm512_mask_extractf64x4_pd( __W, __U, __A, 1); } +TEST_CONSTEXPR(match_m256d(_mm512_mask_extractf64x4_pd( + (__m256d){100.0,101.0,102.0,103.0}, // W(merge) + (__mmask8)0x5, // 0101b + (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}, + 1), + 4.0, 101.0, 6.0, 103.0)); __m256d 
test_mm512_maskz_extractf64x4_pd(__mmask8 __U,__m512d __A){ // CHECK-LABEL: test_mm512_maskz_extractf64x4_pd @@ -2466,6 +2477,11 @@ __m256d test_mm512_maskz_extractf64x4_pd(__mmask8 __U,__m512d __A){ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm512_maskz_extractf64x4_pd( __U, __A, 1); } +TEST_CONSTEXPR(match_m256d(_mm512_maskz_extractf64x4_pd( + (__mmask8)0x3, + (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}, + 1), + 4.0, 5.0, 0.0, 0.0)); __m128 test_mm512_extractf32x4_ps(__m512 a) { @@ -2473,6 +2489,10 @@ __m128 test_mm512_extractf32x4_ps(__m512 a) // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <4 x i32> return _mm512_extractf32x4_ps(a, 1); } +TEST_CONSTEXPR(match_m128(_mm512_extractf32x4_ps(((__m512){ + 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 + }), 1), + 4.0f, 5.0f, 6.0f, 7.0f)); __m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U,__m512 __A){ // CHECK-LABEL: test_mm512_mask_extractf32x4_ps @@ -2480,6 +2500,12 @@ __m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U,__m512 __A){ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm512_mask_extractf32x4_ps( __W, __U, __A, 1); } +TEST_CONSTEXPR(match_m128(_mm512_mask_extractf32x4_ps( + (__m128){100,101,102,103}, // W(merge) + (__mmask8)0x5, // 0101b + (__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}, + 1), + 4.0f, 101.0f, 6.0f, 103.0f)); __m128 test_mm512_maskz_extractf32x4_ps( __mmask8 __U,__m512 __A){ // CHECK-LABEL: test_mm512_maskz_extractf32x4_ps @@ -2487,6 +2513,11 @@ __m128 test_mm512_maskz_extractf32x4_ps( __mmask8 __U,__m512 __A){ // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm512_maskz_extractf32x4_ps(__U, __A, 1); } +TEST_CONSTEXPR(match_m128(_mm512_maskz_extractf32x4_ps( + (__mmask8)0x3, + (__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}, + 1), + 4.0f, 5.0f, 0.0f, 0.0f)); __mmask16 test_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) { // CHECK-LABEL: test_mm512_cmpeq_epu32_mask @@ -7357,6 +7388,10 @@ __m128i test_mm512_extracti32x4_epi32(__m512i __A) { // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <4 x i32> return _mm512_extracti32x4_epi32(__A, 3); } +TEST_CONSTEXPR(match_m128i(_mm512_extracti32x4_epi32(((__m512i){ + 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 + }), 3), + 12, 13, 14, 15)); __m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_extracti32x4_epi32 @@ -7364,6 +7399,15 @@ __m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm512_mask_extracti32x4_epi32(__W, __U, __A, 3); } +TEST_CONSTEXPR(match_m128i(_mm512_mask_extracti32x4_epi32( + (__m128i){100,101,102,103}, // merge=W + (__mmask8)0x5, // 0101b + (__m512i){ + 0,1,2,3, 4,5,6,7, + 8,9,10,11, 12,13,14,15 + }, + 3), + 12, 101, 14, 103)); __m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_extracti32x4_epi32 @@ -7371,12 +7415,24 @@ __m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) { // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm512_maskz_extracti32x4_epi32(__U, __A, 3); } +TEST_CONSTEXPR(match_m128i(_mm512_maskz_extracti32x4_epi32( + (__mmask8)0x3, + (__m512i){ + 0,1,2,3, 4,5,6,7, + 8,9,10,11, 12,13,14,15 + }, + 3), +12, 13, 0, 0)); __m256i test_mm512_extracti64x4_epi64(__m512i __A) { // CHECK-LABEL: 
test_mm512_extracti64x4_epi64 // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <4 x i32> return _mm512_extracti64x4_epi64(__A, 1); } +TEST_CONSTEXPR(match_m256i(_mm512_extracti64x4_epi64(((__m512i){ + 0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL + }), 1), + 4ULL, 5ULL, 6ULL, 7ULL)); __m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_extracti64x4_epi64 @@ -7384,6 +7440,12 @@ __m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm512_mask_extracti64x4_epi64(__W, __U, __A, 1); } +TEST_CONSTEXPR(match_m256i_64(_mm512_mask_extracti64x4_epi64( + (__m256i){100ULL,101ULL,102ULL,103ULL}, + (__mmask8)0x5, + (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}, + 1), + 4ULL, 101ULL, 6ULL, 103ULL)); __m256i test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_extracti64x4_epi64 @@ -7391,6 +7453,11 @@ __m256i test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) { // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm512_maskz_extracti64x4_epi64(__U, __A, 1); } +TEST_CONSTEXPR(match_m256i(_mm512_maskz_extracti64x4_epi64( + (__mmask8)0x3, + (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}, + 1), + 4ULL, 5ULL, 0ULL, 0ULL)); __m512d test_mm512_insertf64x4(__m512d __A, __m256d __B) { // CHECK-LABEL: test_mm512_insertf64x4 diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 51385d57d2944..323ac1b2cab63 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -9875,6 +9875,10 @@ __m128 test_mm256_extractf32x4_ps(__m256 __A) { // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> return _mm256_extractf32x4_ps(__A, 1); } +TEST_CONSTEXPR(match_m128(_mm256_extractf32x4_ps(((__m256){ + 0,1,2,3, 4,5,6,7 + }), 1), + 4.0f, 5.0f, 6.0f, 7.0f)); __m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_mask_extractf32x4_ps @@ -9882,6 +9886,12 @@ __m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) { // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm256_mask_extractf32x4_ps(__W, __U, __A, 1); } +TEST_CONSTEXPR( match_m128(_mm256_mask_extractf32x4_ps( + (__m128){100,101,102,103}, // W (merge) + (__mmask8)0x5, // 0101b + (__m256){0,1,2,3, 4,5,6,7}, + 1), + 4.0f, 101.0f, 6.0f, 103.0f)); __m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_maskz_extractf32x4_ps @@ -9889,12 +9899,21 @@ __m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) { // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm256_maskz_extractf32x4_ps(__U, __A, 1); } +TEST_CONSTEXPR(match_m128(_mm256_maskz_extractf32x4_ps( + (__mmask8)0x3, + (__m256){0,1,2,3, 4,5,6,7}, + 1), + 4.0f, 5.0f, 0.0f, 0.0f)); __m128i test_mm256_extracti32x4_epi32(__m256i __A) { // CHECK-LABEL: test_mm256_extracti32x4_epi32 // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <4 x i32> return _mm256_extracti32x4_epi32(__A, 1); } +TEST_CONSTEXPR(match_m128i(_mm256_extracti32x4_epi32(((__m256i){ + 0,1,2,3, 4,5,6,7 + }), 1), + 4, 5, 6, 7)); __m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_extracti32x4_epi32 @@ -9902,6 +9921,12 @@ 
__m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm256_mask_extracti32x4_epi32(__W, __U, __A, 1); } +TEST_CONSTEXPR(match_m128i(_mm256_mask_extracti32x4_epi32( + (__m128i){100,101,102,103}, // W (merge) + (__mmask8)0xA, // 1010b + (__m256i){0,1,2,3, 4,5,6,7}, + 1), + 100, 5, 102, 7)); __m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_maskz_extracti32x4_epi32 @@ -9909,6 +9934,11 @@ __m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) { // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm256_maskz_extracti32x4_epi32(__U, __A, 1); } +TEST_CONSTEXPR(match_m128i(_mm256_maskz_extracti32x4_epi32( + (__mmask8)0x3, + (__m256i){0,1,2,3, 4,5,6,7}, + 1), + 4, 5, 0, 0)); __m256 test_mm256_insertf32x4(__m256 __A, __m128 __B) { // CHECK-LABEL: test_mm256_insertf32x4 diff --git a/clang/test/CodeGen/X86/avx512vldq-builtins.c b/clang/test/CodeGen/X86/avx512vldq-builtins.c index 938845799acf5..9cfcfea3dafc7 100644 --- a/clang/test/CodeGen/X86/avx512vldq-builtins.c +++ b/clang/test/CodeGen/X86/avx512vldq-builtins.c @@ -1083,6 +1083,8 @@ __m128d test_mm256_extractf64x2_pd(__m256d __A) { // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> return _mm256_extractf64x2_pd(__A, 1); } +TEST_CONSTEXPR(match_m128d(_mm256_extractf64x2_pd(((__m256d){0.0,1.0,2.0,3.0}), 1), + 2.0, 3.0)); __m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A) { // CHECK-LABEL: test_mm256_mask_extractf64x2_pd @@ -1090,6 +1092,12 @@ __m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A) // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm256_mask_extractf64x2_pd(__W, __U, __A, 1); } +TEST_CONSTEXPR(match_m128d(_mm256_mask_extractf64x2_pd( + (__m128d){100.0, 101.0}, // W(merge) + (__mmask8)0x1, + (__m256d){0.0,1.0,2.0,3.0}, + 1), + 2.0, 101.0)); __m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) { // CHECK-LABEL: test_mm256_maskz_extractf64x2_pd @@ -1097,12 +1105,19 @@ __m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) { // CHECK: select <2 x i1> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}} return _mm256_maskz_extractf64x2_pd(__U, __A, 1); } +TEST_CONSTEXPR(match_m128d(_mm256_maskz_extractf64x2_pd( + (__mmask8)0x2, + (__m256d){0.0,1.0,2.0,3.0}, + 1), + 0.0, 3.0)); __m128i test_mm256_extracti64x2_epi64(__m256i __A) { // CHECK-LABEL: test_mm256_extracti64x2_epi64 // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> return _mm256_extracti64x2_epi64(__A, 1); } +TEST_CONSTEXPR(match_m128i_64(_mm256_extracti64x2_epi64(((__m256i){0ULL,1ULL,2ULL,3ULL}), 1), + 2ULL, 3ULL)); __m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_extracti64x2_epi64 @@ -1110,6 +1125,12 @@ __m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm256_mask_extracti64x2_epi64(__W, __U, __A, 1); } +TEST_CONSTEXPR(match_m128i_64(_mm256_mask_extracti64x2_epi64( + (__m128i){100ULL, 101ULL}, // W(merge) + (__mmask8)0x1, + (__m256i){0ULL,1ULL,2ULL,3ULL}, + 1), + 2ULL, 101ULL)); __m128i test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_maskz_extracti64x2_epi64 @@ -1117,6 +1138,11 @@ __m128i 
test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) { // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm256_maskz_extracti64x2_epi64(__U, __A, 1); } +TEST_CONSTEXPR(match_m128i_64(_mm256_maskz_extracti64x2_epi64( + (__mmask8)0x2, + (__m256i){0ULL,1ULL,2ULL,3ULL}, + 1), + 0ULL, 3ULL)); __m256d test_mm256_insertf64x2(__m256d __A, __m128d __B) { // CHECK-LABEL: test_mm256_insertf64x2 From 8a8b202795fe430b20b7249660ae6dec2cb9199c Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Fri, 26 Sep 2025 11:17:39 +0900 Subject: [PATCH 09/21] Refactoring --- clang/include/clang/Basic/BuiltinsX86.td | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 217589d7add1d..909bb3b51b8b1 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -481,9 +481,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">; def cmppd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant char)">; def cmpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">; - def vextractf128_pd256 : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int)">; - def vextractf128_ps256 : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int)">; - def vextractf128_si256 : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int)">; def cvtpd2ps256 : X86Builtin<"_Vector<4, float>(_Vector<4, double>)">; def cvtps2dq256 : X86Builtin<"_Vector<8, int>(_Vector<8, float>)">; def cvttpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">; @@ -504,6 +501,9 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">; def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">; def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">; + def vextractf128_pd256 : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int)">; + def vextractf128_ps256 : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int)">; + def vextractf128_si256 : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int)">; def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">; def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">; def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">; @@ -607,7 +607,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">; def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">; def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">; - def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">; } @@ -652,6 +651,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def packsswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, 
short>, _Vector<16, short>)">; def packssdw256 : X86Builtin<"_Vector<16, short>(_Vector<8, int>, _Vector<8, int>)">; def packuswb256 : X86Builtin<"_Vector<32, char>(_Vector<16, short>, _Vector<16, short>)">; + def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">; def pshuflw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">; def pshufhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">; @@ -1065,7 +1065,7 @@ let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256 def alignq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def extractf64x4_mask : X86Builtin<"_Vector<4, double>(_Vector<8, double>, _Constant int, _Vector<4, double>, unsigned char)">; def extractf32x4_mask : X86Builtin<"_Vector<4, float>(_Vector<16, float>, _Constant int, _Vector<4, float>, unsigned char)">; } @@ -2944,24 +2944,24 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in { def pmovqw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, long long int>, unsigned char)">; } -let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def extractf32x8_mask : X86Builtin<"_Vector<8, float>(_Vector<16, float>, _Constant int, _Vector<8, float>, unsigned char)">; def extractf64x2_512_mask : X86Builtin<"_Vector<2, double>(_Vector<8, double>, _Constant int, _Vector<2, double>, unsigned char)">; def extracti32x8_mask : X86Builtin<"_Vector<8, int>(_Vector<16, int>, _Constant int, _Vector<8, int>, unsigned char)">; def extracti64x2_512_mask : X86Builtin<"_Vector<2, long long int>(_Vector<8, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">; } -let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in { +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def extracti32x4_mask : X86Builtin<"_Vector<4, int>(_Vector<16, int>, _Constant int, _Vector<4, int>, unsigned char)">; def extracti64x4_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, long long int>, _Constant int, _Vector<4, long long int>, unsigned char)">; } -let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def extractf64x2_256_mask : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int, _Vector<2, double>, unsigned char)">; def extracti64x2_256_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">; } -let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { +let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { def extractf32x4_256_mask : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int, _Vector<4, float>, unsigned char)">; def extracti32x4_256_mask : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int, _Vector<4, int>, unsigned char)">; } From 016eaec9c2c9a2a84ce1b7f364060a84ecbd35be Mon Sep 17 
00:00:00 2001 From: SeongjaeP Date: Fri, 26 Sep 2025 11:20:04 +0900 Subject: [PATCH 10/21] Refactoring and Test Pass --- clang/test/CodeGen/X86/avx2-builtins.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index c04d50c893c21..de33b72995f5c 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -466,6 +466,8 @@ __m128i test0_mm256_extracti128_si256_0(__m256i a) { // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> return _mm256_extracti128_si256(a, 0); } +TEST_CONSTEXPR(match_m128i(_mm256_extracti128_si256(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), 0), + 1ULL, 2ULL)); __m128i test1_mm256_extracti128_si256_1(__m256i a) { // CHECK-LABEL: test1_mm256_extracti128_si256 @@ -479,8 +481,6 @@ __m128i test2_mm256_extracti128_si256(__m256i a) { // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> return _mm256_extracti128_si256(a, 0); } -TEST_CONSTEXPR(match_m128i(_mm256_extracti128_si256(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), 0), - 1ULL, 2ULL)); __m256i test_mm256_hadd_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_hadd_epi16 From 1b1987a02e34451bda66070d93465271f5840943 Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Fri, 26 Sep 2025 11:20:56 +0900 Subject: [PATCH 11/21] Refactoring and Test Pass --- clang/test/CodeGen/X86/avx-builtins.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 5aa69c75aea28..7765468a9472a 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -1070,7 +1070,6 @@ __m128d test_mm256_extractf128_pd(__m256d A) { // CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> poison, <2 x i32> return _mm256_extractf128_pd(A, 1); } - TEST_CONSTEXPR(match_m128d(_mm256_extractf128_pd(((__m256d){0.0, 1.0, 2.0, 3.0}), 1), 2.0, 3.0)); @@ -1079,7 +1078,6 @@ __m128 test_mm256_extractf128_ps(__m256 A) { // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> return _mm256_extractf128_ps(A, 1); } - TEST_CONSTEXPR(match_m128(_mm256_extractf128_ps(((__m256){0,1,2,3,4,5,6,7}), 1), 4.0f, 5.0f, 6.0f, 7.0f)); @@ -1088,9 +1086,8 @@ __m128i test_mm256_extractf128_si256(__m256i A) { // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <4 x i32> return _mm256_extractf128_si256(A, 1); } - -TEST_CONSTEXPR(match_m128i(_mm256_extractf128_si256(((__m256i){0,1,2,3,4,5,6,7}), 1), - 4, 5, 6, 7)); +TEST_CONSTEXPR(match_m128i(_mm256_extractf128_si256(((__m256i){0ULL, 1ULL, 2ULL, 3ULL}), 1), + 2ULL, 3ULL)); __m256d test_mm256_floor_pd(__m256d x) { // CHECK-LABEL: test_mm256_floor_pd From 1a7013cc5ea89f07518a95c4a653195db503e68a Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Sat, 27 Sep 2025 15:11:00 +0900 Subject: [PATCH 12/21] Refactoring and add avx512dq test --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 99 +++++++++++++--------- clang/lib/AST/ExprConstant.cpp | 75 +++------------- clang/test/CodeGen/X86/avx512dq-builtins.c | 70 ++++++++------- 3 files changed, 106 insertions(+), 138 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index c4040158ca440..60ba4a06bf357 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2896,71 +2896,94 @@ static bool interp__builtin_x86_extract_vector(InterpState &S, CodePtr OpPC, // __builtin_extract_masked 
static bool interp__builtin_x86_extract_vector_masked(InterpState &S, CodePtr OpPC, - const CallExpr *Call, - unsigned ID) { - assert(Call->getNumArgs() == 4); + const CallExpr *Call, + unsigned ID) { + unsigned NumArgs = Call->getNumArgs(); - // kmask - APSInt KmaskAPS = popToAPSInt(S, Call->getArg(3)); - uint64_t Kmask = KmaskAPS.getZExtValue(); + const Pointer &Dst = S.Stk.peek<Pointer>(); + if (!Dst.getFieldDesc()->isPrimitiveArray()) + return false; - // merge - const Pointer &Merge = S.Stk.pop<Pointer>(); - bool HasMergeVec = Merge.isLive() && Merge.getFieldDesc() && - Merge.getFieldDesc()->isPrimitiveArray(); - - // srcimm - APSInt ImmAPS = popToAPSInt(S, Call->getArg(1)); - uint64_t Index = ImmAPS.getZExtValue(); + const Pointer *Merge = nullptr; + uint64_t Kmask = 0; + uint64_t Imm = 0; + const Pointer *Src = nullptr; - // srcvec - const Pointer &Src = S.Stk.pop<Pointer>(); - if (!Src.getFieldDesc()->isPrimitiveArray()) + if (NumArgs == 4) { + // __m256 _mm512_mask_extractf32x8_ps(W, U, A, imm) + APSInt ImmAPS = popToAPSInt(S, Call->getArg(3)); + Imm = ImmAPS.getZExtValue(); + + const Pointer &SrcP = S.Stk.pop<Pointer>(); + Src = &SrcP; + + APSInt KmaskAPS = popToAPSInt(S, Call->getArg(1)); + Kmask = KmaskAPS.getZExtValue(); + + const Pointer &MergeP = S.Stk.pop<Pointer>(); + Merge = &MergeP; + + } else if (NumArgs == 3) { + // __m256 _mm512_maskz_extractf32x8_ps(U, A, imm) + APSInt ImmAPS = popToAPSInt(S, Call->getArg(2)); + Imm = ImmAPS.getZExtValue(); + + const Pointer &SrcP = S.Stk.pop<Pointer>(); + Src = &SrcP; + + APSInt KmaskAPS = popToAPSInt(S, Call->getArg(0)); + Kmask = KmaskAPS.getZExtValue(); + + Merge = nullptr; // maskz → zero fill + } else { return false; + } - // dst (return) - const Pointer &Dst = S.Stk.peek<Pointer>(); - if (!Dst.getFieldDesc()->isPrimitiveArray()) + if (!Src->getFieldDesc()->isPrimitiveArray()) return false; - unsigned SrcElems = Src.getNumElems(); + unsigned SrcElems = Src->getNumElems(); unsigned DstElems = Dst.getNumElems(); if (SrcElems == 0 || DstElems == 0 || (SrcElems % DstElems) != 0) return false; unsigned NumLanes = SrcElems / DstElems; - unsigned Lane = static_cast<unsigned>(Index % NumLanes); + unsigned Lane = static_cast<unsigned>(Imm % NumLanes); unsigned ExtractPos = Lane * DstElems; - PrimType ElemPT = Src.getFieldDesc()->getPrimType(); + PrimType ElemPT = Src->getFieldDesc()->getPrimType(); if (ElemPT != Dst.getFieldDesc()->getPrimType()) return false; - // Merge vector type/len check(if) - if (HasMergeVec) { - if (Merge.getFieldDesc()->getPrimType() != ElemPT || - Merge.getNumElems() != DstElems) - return false; - } + // --- begin fast path: mask has all bits set --- + unsigned UsedBits = std::min<unsigned>(DstElems, 64); // limit the mask width + uint64_t AllOnes = (UsedBits == 64 ? 
~0ull : ((1ull << UsedBits) - 1)); + bool MaskAll = (Kmask & AllOnes) == AllOnes; - // generate 0 value - auto storeZeroAt = [&](unsigned I) { + if (MaskAll) { + // merge는 무시, src에서 그대로 복사 TYPE_SWITCH(ElemPT, { - Dst.elem(I) = T{}; + for (unsigned I = 0; I != DstElems; ++I) + Dst.elem(I) = Src->elem(ExtractPos + I); }); + Dst.initializeAllElements(); + return true; + } + // --- fast-path 끝 --- + + auto storeZeroAt = [&](unsigned I) { + TYPE_SWITCH(ElemPT, { Dst.elem(I) = T{}; }); }; TYPE_SWITCH(ElemPT, { for (unsigned I = 0; I != DstElems; ++I) { bool Take = ((Kmask >> I) & 1) != 0; if (Take) { - Dst.elem(I) = Src.elem(ExtractPos + I); + Dst.elem(I) = Src->elem(ExtractPos + I); + } else if (Merge) { + Dst.elem(I) = Merge->elem(I); } else { - if (HasMergeVec) { - Dst.elem(I) = Merge.elem(I); - } else { - storeZeroAt(I); - } + storeZeroAt(I); } } }); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 7d2b341fe9ce7..22057955d5160 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12060,12 +12060,16 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), RetLen), E); } - case X86::BI__builtin_ia32_extracti32x4_256_mask: // _mm256_extracti32x4_epi32 - case X86::BI__builtin_ia32_extracti32x4_mask: // _mm512_extracti32x4_epi32 - case X86::BI__builtin_ia32_extracti32x8_mask: // _mm512_extracti32x8_epi32 - case X86::BI__builtin_ia32_extracti64x2_256_mask: // _mm256_extracti64x2_epi64 - case X86::BI__builtin_ia32_extracti64x2_512_mask: // _mm512_extracti64x2_epi64 - case X86::BI__builtin_ia32_extracti64x4_mask: { // _mm512_extracti64x4_epi64 + case X86::BI__builtin_ia32_extracti32x4_256_mask: + case X86::BI__builtin_ia32_extractf32x4_256_mask: + case X86::BI__builtin_ia32_extracti32x4_mask: + case X86::BI__builtin_ia32_extractf32x4_mask: + case X86::BI__builtin_ia32_extracti32x8_mask: + case X86::BI__builtin_ia32_extractf32x8_mask: + case X86::BI__builtin_ia32_extracti64x2_256_mask: + case X86::BI__builtin_ia32_extractf64x2_256_mask: + case X86::BI__builtin_ia32_extracti64x2_512_mask: + case X86::BI__builtin_ia32_extractf64x2_512_mask: { APValue SourceVec, SourceImm, SourceMerge, SourceKmask; if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) || !EvaluateAsRValue(Info, E->getArg(1), SourceImm) || @@ -12085,11 +12089,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { unsigned NumLanes = SrcLen / RetLen; unsigned idx = SourceImm.getInt().getZExtValue() & (NumLanes - 1); - - // Step 2) Apply kmask (covers plain/mask/maskz): - // - plain : headers pass kmask=all-ones; merge is undef → always take Extracted. - // - mask : merge=dst; take? Extracted[i] : dst[i] - // - maskz : merge=zero; take? Extracted[i] : 0 + uint64_t KmaskBits = SourceKmask.getInt().getZExtValue(); auto makeZeroInt = [&]() -> APValue { @@ -12105,9 +12105,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (Take) { ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i)); } else { - // For plain (all-ones) this path is never taken. - // For mask : merge is the original dst element. - // For maskz : headers pass zero vector as merge. + const APValue &MergeElt = SourceMerge.isVector() ? 
SourceMerge.getVectorElt(i) : makeZeroInt(); ResultElements.push_back(MergeElt); @@ -12116,55 +12114,6 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), RetLen), E); } - case X86::BI__builtin_ia32_extractf32x4_256_mask: // _mm256_extractf32x4_ps _mm256_mask_extractf32x4_ps _mm256_maskz_extractf32x4_ps - case X86::BI__builtin_ia32_extractf32x4_mask: // _mm512_extractf32x4_ps _mm512_mask_extractf32x4_ps _mm512_maskz_extractf32x4_ps - case X86::BI__builtin_ia32_extractf32x8_mask: // _mm512_extractf32x8_ps _mm512_mask_extractf32x8_ps _mm512_maskz_extractf32x8_ps - - case X86::BI__builtin_ia32_extractf64x2_256_mask: // _mm256_extractf64x2_pd _mm256_mask_extractf64x2_pd _mm256_maskz_extractf64x2_pd - case X86::BI__builtin_ia32_extractf64x2_512_mask: // _mm512_extractf64x2_pd _mm512_mask_extractf64x2_pd _mm512_maskz_extractf64x2_pd - case X86::BI__builtin_ia32_extractf64x4_mask: { // _mm512_extractf64x4_pd _mm512_mask_extractf64x4_pd _mm512_maskz_extractf64x4_pd - APValue SourceVec, SourceImm, SourceMerge, SourceKmask; - if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) || - !EvaluateAsRValue(Info, E->getArg(1), SourceImm) || - !EvaluateAsRValue(Info, E->getArg(2), SourceMerge) || - !EvaluateAsRValue(Info, E->getArg(3), SourceKmask)) - return false; - - const auto *RetVT = E->getType()->castAs(); - QualType EltTy = RetVT->getElementType(); - unsigned RetLen = RetVT->getNumElements(); - - if (!SourceVec.isVector()) - return false; - unsigned SrcLen = SourceVec.getVectorLength(); - if (SrcLen % RetLen != 0) - return false; - - unsigned NumLanes = SrcLen / RetLen; - unsigned idx = SourceImm.getInt().getZExtValue() & (NumLanes - 1); - - uint64_t KmaskBits = SourceKmask.getInt().getZExtValue(); - - auto makeZeroFP = [&]() -> APValue { - const llvm::fltSemantics &Sem = - Info.Ctx.getFloatTypeSemantics(EltTy); - return APValue(llvm::APFloat::getZero(Sem)); - }; - - SmallVector ResultElements; - ResultElements.reserve(RetLen); - for (unsigned i = 0; i < RetLen; i++) { - bool Take = (KmaskBits >> i) & 1; - if (Take) { - ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i)); - } else { - const APValue &MergeElt = - SourceMerge.isVector() ? 
SourceMerge.getVectorElt(i) : makeZeroFP(); - ResultElements.push_back(MergeElt); - } - } - return Success(APValue(ResultElements.data(), RetLen), E); - } case X86::BI__builtin_ia32_vpshldd128: case X86::BI__builtin_ia32_vpshldd256: diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index 08013705875d0..5a61040db9ef3 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -1415,12 +1415,12 @@ __m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) { return _mm512_mask_extractf32x8_ps(__W, __U, __A, 1); } TEST_CONSTEXPR(match_m256(_mm512_mask_extractf32x8_ps( - (__m256){0,0,0,0,0,0,0,0}, // W - ((__mmask8)0xFF), // U = all ones (plain) - (__m512){ + ((__m256)(__v8sf){0,0,0,0,0,0,0,0}), // W + (__mmask8)0xFF, + ((__m512)(__v16sf){ 0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f, 8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f - }, + }), 1), 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f)); @@ -1431,11 +1431,11 @@ __m256 test_mm512_maskz_extractf32x8_ps(__mmask8 __U, __m512 __A) { return _mm512_maskz_extractf32x8_ps(__U, __A, 1); } TEST_CONSTEXPR(match_m256(_mm512_maskz_extractf32x8_ps( - ((__mmask8)0x0F), - (__m512){ + (__mmask8)0x0F, + ((__m512)(__v16sf){ 0.0f,1.0f,2.0f,3.0f,4.0f,5.0f,6.0f,7.0f, 8.0f,9.0f,10.0f,11.0f,12.0f,13.0f,14.0f,15.0f - }, + }), 1), 8.0f, 9.0f, 10.0f, 11.0f, 0.0f, 0.0f, 0.0f, 0.0f)); @@ -1444,9 +1444,8 @@ __m128d test_mm512_extractf64x2_pd(__m512d __A) { // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <2 x i32> return _mm512_extractf64x2_pd(__A, 3); } -TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd(((__m512d){ - 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 - }), 3), +TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd( + ((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}), 3), 6.0, 7.0)); __m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) { @@ -1456,9 +1455,9 @@ __m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) return _mm512_mask_extractf64x2_pd(__W, __U, __A, 3); } TEST_CONSTEXPR(match_m128d(_mm512_mask_extractf64x2_pd( - (__m128d){100.0, 101.0}, // W(merge) - (__mmask8)0x1, // 0000 0001b - (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}, + ((__m128d)(__v2df){100.0, 101.0}), // W(merge) + (__mmask8)0x1, + ((__m512d)(__v8df){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}), 3), 6.0, 101.0)); @@ -1469,8 +1468,8 @@ __m128d test_mm512_maskz_extractf64x2_pd(__mmask8 __U, __m512d __A) { return _mm512_maskz_extractf64x2_pd(__U, __A, 3); } TEST_CONSTEXPR(match_m128d(_mm512_maskz_extractf64x2_pd( - (__mmask8)0x2, // 0000 0010b - (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}, + (__mmask8)0x2, + ((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}), 3), 0.0, 7.0)); @@ -1479,9 +1478,8 @@ __m256i test_mm512_extracti32x8_epi32(__m512i __A) { // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <8 x i32> return _mm512_extracti32x8_epi32(__A, 1); } -TEST_CONSTEXPR(match_m256i(_mm512_extracti32x8_epi32(((__m512i){ - 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 - }), 1), +TEST_CONSTEXPR(match_v8si(_mm512_extracti32x8_epi32( + ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1), 8, 9,10,11,12,13,14,15)); __m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) { @@ -1490,12 +1488,11 @@ __m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __ // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm512_mask_extracti32x8_epi32(__W, __U, __A, 
1); } -TEST_CONSTEXPR(match_m256i(_mm512_mask_extracti32x8_epi32( - (__m256i){100,101,102,103,104,105,106,107}, // W(merge) - (__mmask8)0xAA, // 1010 1010b → only odd lanetake - (__m512i){ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }, +TEST_CONSTEXPR(match_v8si(_mm512_mask_extracti32x8_epi32( + ((__m256i)(__v8si){100,101,102,103,104,105,106,107}), + (__mmask8)0xAA, + ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1), - // lane0..7: 8,9,10,11,12,13,14,15 100, 9, 102, 11, 104, 13, 106, 15)); __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) { @@ -1504,9 +1501,9 @@ __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm512_maskz_extracti32x8_epi32(__U, __A, 1); } -TEST_CONSTEXPR(match_m256i(_mm512_maskz_extracti32x8_epi32( - (__mmask8)0x0F, - (__m512i){ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 }, +TEST_CONSTEXPR(match_v8si(_mm512_maskz_extracti32x8_epi32( + (__mmask8)0x0F, + ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1), 8, 9, 10, 11, 0, 0, 0, 0)); @@ -1515,9 +1512,8 @@ __m128i test_mm512_extracti64x2_epi64(__m512i __A) { // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <2 x i32> return _mm512_extracti64x2_epi64(__A, 3); } -TEST_CONSTEXPR(match_m128i_64(_mm512_extracti64x2_epi64(((__m512i){ - 0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL - }), 3), +TEST_CONSTEXPR(match_m128i(_mm512_extracti64x2_epi64( + ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3), 6ULL, 7ULL)); __m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) { @@ -1526,10 +1522,10 @@ __m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm512_mask_extracti64x2_epi64(__W, __U, __A, 3); } -TEST_CONSTEXPR(match_m128i_64(_mm512_mask_extracti64x2_epi64( - (__m128i){100ULL, 101ULL}, // W(merge) - (__mmask8)0x1, // lane0만 take - (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}, +TEST_CONSTEXPR(match_m128i(_mm512_mask_extracti64x2_epi64( + ((__m128i)(__v2di){100ULL, 101ULL}), + (__mmask8)0x1, + ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3), 6ULL, 101ULL)); @@ -1539,11 +1535,11 @@ __m128i test_mm512_maskz_extracti64x2_epi64(__mmask8 __U, __m512i __A) { // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm512_maskz_extracti64x2_epi64(__U, __A, 3); } -TEST_CONSTEXPR(match_m128i_64(_mm512_maskz_extracti64x2_epi64( - (__mmask8)0x2, // lane1 take, lane0 0 - (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}, +TEST_CONSTEXPR(match_m128i(_mm512_maskz_extracti64x2_epi64( + (__mmask8)0x2, + ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3), - 0ULL, 7ULL)); + 0ULL, 7ULL)) __m512 test_mm512_insertf32x8(__m512 __A, __m256 __B) { // CHECK-LABEL: test_mm512_insertf32x8 From b002c17acde0d58809abeaa9bb0e25b1a2b928d5 Mon Sep 17 00:00:00 2001 From: seongjaep Date: Sun, 28 Sep 2025 18:53:45 +0900 Subject: [PATCH 13/21] no mask version test --- clang/test/CodeGen/X86/avx512dq-builtins.c | 40 +++++++++++----------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index 5a61040db9ef3..e9f344b240329 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -1402,11 +1402,11 @@ __m256 test_mm512_extractf32x8_ps(__m512 __A) { // CHECK: 
shufflevector <16 x float> %{{.*}}, <16 x float> poison, <8 x i32> return _mm512_extractf32x8_ps(__A, 1); } -TEST_CONSTEXPR(match_m256(_mm512_extractf32x8_ps(((__m512){ - 0.0f,1.0f,2.0f,3.0f, 4.0f,5.0f,6.0f,7.0f, - 8.0f,9.0f,10.0f,11.0f, 12.0f,13.0f,14.0f,15.0f - }), 1), - 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f)); +// TEST_CONSTEXPR(match_m256(_mm512_extractf32x8_ps(((__m512){ +// 0.0f,1.0f,2.0f,3.0f, 4.0f,5.0f,6.0f,7.0f, +// 8.0f,9.0f,10.0f,11.0f, 12.0f,13.0f,14.0f,15.0f +// }), 1), +// 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f)); __m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) { // CHECK-LABEL: test_mm512_mask_extractf32x8_ps @@ -1444,9 +1444,9 @@ __m128d test_mm512_extractf64x2_pd(__m512d __A) { // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <2 x i32> return _mm512_extractf64x2_pd(__A, 3); } -TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd( - ((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}), 3), - 6.0, 7.0)); +// TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd( +// ((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}), 3), +// 6.0, 7.0)); __m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) { // CHECK-LABEL: test_mm512_mask_extractf64x2_pd @@ -1478,9 +1478,9 @@ __m256i test_mm512_extracti32x8_epi32(__m512i __A) { // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <8 x i32> return _mm512_extracti32x8_epi32(__A, 1); } -TEST_CONSTEXPR(match_v8si(_mm512_extracti32x8_epi32( - ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1), - 8, 9,10,11,12,13,14,15)); +// TEST_CONSTEXPR(match_v8si(_mm512_extracti32x8_epi32( +// ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1), +// 8, 9,10,11,12,13,14,15)); __m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_extracti32x8_epi32 @@ -1501,20 +1501,20 @@ __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm512_maskz_extracti32x8_epi32(__U, __A, 1); } -TEST_CONSTEXPR(match_v8si(_mm512_maskz_extracti32x8_epi32( - (__mmask8)0x0F, - ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), - 1), - 8, 9, 10, 11, 0, 0, 0, 0)); +// TEST_CONSTEXPR(match_v8si(_mm512_maskz_extracti32x8_epi32( +// (__mmask8)0x0F, +// ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), +// 1), +// 8, 9, 10, 11, 0, 0, 0, 0)); __m128i test_mm512_extracti64x2_epi64(__m512i __A) { // CHECK-LABEL: test_mm512_extracti64x2_epi64 // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <2 x i32> return _mm512_extracti64x2_epi64(__A, 3); } -TEST_CONSTEXPR(match_m128i(_mm512_extracti64x2_epi64( - ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3), - 6ULL, 7ULL)); +// TEST_CONSTEXPR(match_m128i(_mm512_extracti64x2_epi64( +// ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3), +// 6ULL, 7ULL)); __m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_extracti64x2_epi64 @@ -1539,7 +1539,7 @@ TEST_CONSTEXPR(match_m128i(_mm512_maskz_extracti64x2_epi64( (__mmask8)0x2, ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3), - 0ULL, 7ULL)) + 0ULL, 7ULL)); __m512 test_mm512_insertf32x8(__m512 __A, __m256 __B) { // CHECK-LABEL: test_mm512_insertf32x8 From de74751c42efb89af7a8c0c60f93f008fedb0f30 Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Mon, 29 Sep 2025 15:54:18 +0900 Subject: [PATCH 
14/21] fix for test undefined -> setzero --- clang/lib/Headers/avx512dqintrin.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Headers/avx512dqintrin.h b/clang/lib/Headers/avx512dqintrin.h index fb65bf933b8ad..0ff776b36436e 100644 --- a/clang/lib/Headers/avx512dqintrin.h +++ b/clang/lib/Headers/avx512dqintrin.h @@ -1214,7 +1214,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) #define _mm512_extractf32x8_ps(A, imm) \ ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)_mm256_undefined_ps(), \ + (__v8sf)_mm256_setzero_ps(), \ (__mmask8)-1)) #define _mm512_mask_extractf32x8_ps(W, U, A, imm) \ @@ -1230,7 +1230,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) #define _mm512_extractf64x2_pd(A, imm) \ ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ (int)(imm), \ - (__v2df)_mm_undefined_pd(), \ + (__v2df)_mm_setzero_pd(), \ (__mmask8)-1)) #define _mm512_mask_extractf64x2_pd(W, U, A, imm) \ @@ -1247,7 +1247,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) #define _mm512_extracti32x8_epi32(A, imm) \ ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)_mm256_undefined_si256(), \ + (__v8si)_mm256_setzero_si256(), \ (__mmask8)-1)) #define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \ @@ -1263,7 +1263,7 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) #define _mm512_extracti64x2_epi64(A, imm) \ ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ (int)(imm), \ - (__v2di)_mm_undefined_si128(), \ + (__v2di)_mm_setzero_si128(), \ (__mmask8)-1)) #define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \ From 5e3c103944ddae98c62f4fa80f1118e272cf535b Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Mon, 29 Sep 2025 23:35:06 +0900 Subject: [PATCH 15/21] refactoring --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 99 ++++++---------------- clang/lib/AST/ExprConstant.cpp | 60 ++++++------- clang/test/CodeGen/X86/avx512dq-builtins.c | 38 ++++----- 3 files changed, 68 insertions(+), 129 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 60ba4a06bf357..05ef09b3cbaee 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2894,97 +2894,46 @@ static bool interp__builtin_x86_extract_vector(InterpState &S, CodePtr OpPC, return true; } -// __builtin_extract_masked static bool interp__builtin_x86_extract_vector_masked(InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned ID) { - unsigned NumArgs = Call->getNumArgs(); - - const Pointer &Dst = S.Stk.peek(); - if (!Dst.getFieldDesc()->isPrimitiveArray()) - return false; - - const Pointer *Merge = nullptr; - uint64_t Kmask = 0; - uint64_t Imm = 0; - const Pointer *Src = nullptr; - - if (NumArgs == 4) { - // __m256 _mm512_mask_extractf32x8_ps(W, U, A, imm) - APSInt ImmAPS = popToAPSInt(S, Call->getArg(3)); - Imm = ImmAPS.getZExtValue(); - - const Pointer &SrcP = S.Stk.pop(); - Src = &SrcP; - - APSInt KmaskAPS = popToAPSInt(S, Call->getArg(1)); - Kmask = KmaskAPS.getZExtValue(); + assert(Call->getNumArgs() == 4); - const Pointer &MergeP = S.Stk.pop(); - Merge = &MergeP; - - } else if (NumArgs == 3) { - // __m256 _mm512_maskz_extractf32x8_ps(U, A, imm) - APSInt ImmAPS = popToAPSInt(S, Call->getArg(2)); - Imm = ImmAPS.getZExtValue(); - - const Pointer &SrcP = S.Stk.pop(); - Src = &SrcP; - - APSInt KmaskAPS = popToAPSInt(S, Call->getArg(0)); - Kmask = 
KmaskAPS.getZExtValue(); + APSInt UAPS = popToAPSInt(S, Call->getArg(3)); + const Pointer &W = S.Stk.pop(); + APSInt ImmAPS = popToAPSInt(S, Call->getArg(1)); + const Pointer &A = S.Stk.pop(); - Merge = nullptr; // maskz → zero fill - } else { + if (!A.getFieldDesc()->isPrimitiveArray() || !W.getFieldDesc()->isPrimitiveArray()) return false; - } - if (!Src->getFieldDesc()->isPrimitiveArray()) + const Pointer &Dst = S.Stk.peek(); + if (!Dst.getFieldDesc()->isPrimitiveArray()) return false; - unsigned SrcElems = Src->getNumElems(); + unsigned SrcElems = A.getNumElems(); unsigned DstElems = Dst.getNumElems(); - if (SrcElems == 0 || DstElems == 0 || (SrcElems % DstElems) != 0) + if (!SrcElems || !DstElems || (SrcElems % DstElems) != 0) return false; - unsigned NumLanes = SrcElems / DstElems; - unsigned Lane = static_cast(Imm % NumLanes); - unsigned ExtractPos = Lane * DstElems; - - PrimType ElemPT = Src->getFieldDesc()->getPrimType(); - if (ElemPT != Dst.getFieldDesc()->getPrimType()) + // 타입 일치 체크 + PrimType PT = A.getFieldDesc()->getPrimType(); + if (PT != Dst.getFieldDesc()->getPrimType() || + PT != W.getFieldDesc()->getPrimType()) return false; - // --- 여기서 fast-path 추가 --- - unsigned UsedBits = std::min(DstElems, 64); // mask 폭 제한 - uint64_t AllOnes = (UsedBits == 64 ? ~0ull : ((1ull << UsedBits) - 1)); - bool MaskAll = (Kmask & AllOnes) == AllOnes; + unsigned numLanes = SrcElems / DstElems; + unsigned lane = static_cast(ImmAPS.getZExtValue() % numLanes); + unsigned base = lane * DstElems; - if (MaskAll) { - // merge는 무시, src에서 그대로 복사 - TYPE_SWITCH(ElemPT, { - for (unsigned I = 0; I != DstElems; ++I) - Dst.elem(I) = Src->elem(ExtractPos + I); - }); - Dst.initializeAllElements(); - return true; - } - // --- fast-path 끝 --- - - auto storeZeroAt = [&](unsigned I) { - TYPE_SWITCH(ElemPT, { Dst.elem(I) = T{}; }); - }; + uint64_t U = UAPS.getZExtValue(); - TYPE_SWITCH(ElemPT, { - for (unsigned I = 0; I != DstElems; ++I) { - bool Take = ((Kmask >> I) & 1) != 0; - if (Take) { - Dst.elem(I) = Src->elem(ExtractPos + I); - } else if (Merge) { - Dst.elem(I) = Merge->elem(I); - } else { - storeZeroAt(I); - } + TYPE_SWITCH(PT, { + for (unsigned i = 0; i < DstElems; ++i) { + if ((U >> i) & 1) + Dst.elem(i) = A.elem(base + i); + else + Dst.elem(i) = W.elem(i); } }); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 22057955d5160..327265b79d101 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12069,49 +12069,39 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_extracti64x2_256_mask: case X86::BI__builtin_ia32_extractf64x2_256_mask: case X86::BI__builtin_ia32_extracti64x2_512_mask: - case X86::BI__builtin_ia32_extractf64x2_512_mask: { - APValue SourceVec, SourceImm, SourceMerge, SourceKmask; - if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) || - !EvaluateAsRValue(Info, E->getArg(1), SourceImm) || - !EvaluateAsRValue(Info, E->getArg(2), SourceMerge) || - !EvaluateAsRValue(Info, E->getArg(3), SourceKmask)) - return false; + case X86::BI__builtin_ia32_extractf64x2_512_mask: + case X86::BI__builtin_ia32_extractf64x4_mask:{ + APValue A, W; + APSInt Imm, U; + + if (!EvaluateAsRValue(Info, E->getArg(0), A) || // A + !EvaluateInteger(E->getArg(1), Imm, Info) || // imm + !EvaluateAsRValue(Info, E->getArg(2), W) || // W (merge) + !EvaluateInteger(E->getArg(3), U, Info)) // U (mask) + return false; const auto *RetVT = E->getType()->castAs(); - QualType EltTy = RetVT->getElementType(); + // QualType EltTy 
= RetVT->getElementType(); unsigned RetLen = RetVT->getNumElements(); - if (!SourceVec.isVector()) - return false; - unsigned SrcLen = SourceVec.getVectorLength(); - if (SrcLen % RetLen != 0) - return false; - - unsigned NumLanes = SrcLen / RetLen; - unsigned idx = SourceImm.getInt().getZExtValue() & (NumLanes - 1); - - uint64_t KmaskBits = SourceKmask.getInt().getZExtValue(); - - auto makeZeroInt = [&]() -> APValue { - bool Uns = EltTy->isUnsignedIntegerOrEnumerationType(); - unsigned BW = Info.Ctx.getIntWidth(EltTy); - return APValue(APSInt(APInt(BW, 0), Uns)); - }; + if (!A.isVector() || !W.isVector()) return false; + unsigned SrcLen = A.getVectorLength(); + if (!SrcLen || !RetLen || (SrcLen % RetLen) != 0) return false; + + unsigned lanes = SrcLen / RetLen; + unsigned lane = static_cast(Imm.getZExtValue() % lanes); + unsigned base = lane * RetLen; + uint64_t K = U.getZExtValue(); SmallVector ResultElements; ResultElements.reserve(RetLen); - for (unsigned i = 0; i < RetLen; i++) { - bool Take = (KmaskBits >> i) & 1; - if (Take) { - ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i)); - } else { - - const APValue &MergeElt = - SourceMerge.isVector() ? SourceMerge.getVectorElt(i) : makeZeroInt(); - ResultElements.push_back(MergeElt); - } + for (unsigned i = 0; i < RetLen; ++i) { + if ((K >> i) & 1) + ResultElements.push_back(A.getVectorElt(base + i)); + else + ResultElements.push_back(W.getVectorElt(i)); // maskz/unmasked 모두 헤더에서 맞춰줌 } - return Success(APValue(ResultElements.data(), RetLen), E); + return Success(APValue(ResultElements.data(), ResultElements.size()), E); } diff --git a/clang/test/CodeGen/X86/avx512dq-builtins.c b/clang/test/CodeGen/X86/avx512dq-builtins.c index e9f344b240329..f6ff1828cb41d 100644 --- a/clang/test/CodeGen/X86/avx512dq-builtins.c +++ b/clang/test/CodeGen/X86/avx512dq-builtins.c @@ -1402,11 +1402,11 @@ __m256 test_mm512_extractf32x8_ps(__m512 __A) { // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <8 x i32> return _mm512_extractf32x8_ps(__A, 1); } -// TEST_CONSTEXPR(match_m256(_mm512_extractf32x8_ps(((__m512){ -// 0.0f,1.0f,2.0f,3.0f, 4.0f,5.0f,6.0f,7.0f, -// 8.0f,9.0f,10.0f,11.0f, 12.0f,13.0f,14.0f,15.0f -// }), 1), -// 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f)); +TEST_CONSTEXPR(match_m256(_mm512_extractf32x8_ps(((__m512){ + 0.0f,1.0f,2.0f,3.0f, 4.0f,5.0f,6.0f,7.0f, + 8.0f,9.0f,10.0f,11.0f, 12.0f,13.0f,14.0f,15.0f + }), 1), + 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f)); __m256 test_mm512_mask_extractf32x8_ps(__m256 __W, __mmask8 __U, __m512 __A) { // CHECK-LABEL: test_mm512_mask_extractf32x8_ps @@ -1444,9 +1444,9 @@ __m128d test_mm512_extractf64x2_pd(__m512d __A) { // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <2 x i32> return _mm512_extractf64x2_pd(__A, 3); } -// TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd( -// ((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}), 3), -// 6.0, 7.0)); +TEST_CONSTEXPR(match_m128d(_mm512_extractf64x2_pd( + ((__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}), 3), + 6.0, 7.0)); __m128d test_mm512_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m512d __A) { // CHECK-LABEL: test_mm512_mask_extractf64x2_pd @@ -1478,9 +1478,9 @@ __m256i test_mm512_extracti32x8_epi32(__m512i __A) { // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <8 x i32> return _mm512_extracti32x8_epi32(__A, 1); } -// TEST_CONSTEXPR(match_v8si(_mm512_extracti32x8_epi32( -// ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1), -// 8, 9,10,11,12,13,14,15)); 
+TEST_CONSTEXPR(match_v8si(_mm512_extracti32x8_epi32( + ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1), + 8, 9,10,11,12,13,14,15)); __m256i test_mm512_mask_extracti32x8_epi32(__m256i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_extracti32x8_epi32 @@ -1501,20 +1501,20 @@ __m256i test_mm512_maskz_extracti32x8_epi32(__mmask8 __U, __m512i __A) { // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}} return _mm512_maskz_extracti32x8_epi32(__U, __A, 1); } -// TEST_CONSTEXPR(match_v8si(_mm512_maskz_extracti32x8_epi32( -// (__mmask8)0x0F, -// ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), -// 1), -// 8, 9, 10, 11, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v8si(_mm512_maskz_extracti32x8_epi32( + (__mmask8)0x0F, + ((__m512i)(__v16si){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), + 1), + 8, 9, 10, 11, 0, 0, 0, 0)); __m128i test_mm512_extracti64x2_epi64(__m512i __A) { // CHECK-LABEL: test_mm512_extracti64x2_epi64 // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <2 x i32> return _mm512_extracti64x2_epi64(__A, 3); } -// TEST_CONSTEXPR(match_m128i(_mm512_extracti64x2_epi64( -// ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3), -// 6ULL, 7ULL)); +TEST_CONSTEXPR(match_m128i(_mm512_extracti64x2_epi64( + ((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}), 3), + 6ULL, 7ULL)); __m128i test_mm512_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_extracti64x2_epi64 From 05d6d8b72f982d16e10e93bcbedfd627471385e2 Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Mon, 29 Sep 2025 23:51:58 +0900 Subject: [PATCH 16/21] Add _extracti64x4_mask --- clang/lib/AST/ExprConstant.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 327265b79d101..ae59da87a3ccd 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12070,6 +12070,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_extractf64x2_256_mask: case X86::BI__builtin_ia32_extracti64x2_512_mask: case X86::BI__builtin_ia32_extractf64x2_512_mask: + case X86::BI__builtin_ia32_extracti64x4_mask: case X86::BI__builtin_ia32_extractf64x4_mask:{ APValue A, W; APSInt Imm, U; From b9d0cdbb15385878f7ae052debc63493732cacff Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Mon, 29 Sep 2025 23:52:50 +0900 Subject: [PATCH 17/21] refactoring and add test code --- clang/lib/Headers/avx512fintrin.h | 8 +-- clang/test/CodeGen/X86/avx512f-builtins.c | 87 ++++++++++++----------- 2 files changed, 48 insertions(+), 47 deletions(-) diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h index 80e58425cdd71..2768a5bae887d 100644 --- a/clang/lib/Headers/avx512fintrin.h +++ b/clang/lib/Headers/avx512fintrin.h @@ -3166,7 +3166,7 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, #define _mm512_extractf64x4_pd(A, I) \ ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ - (__v4df)_mm256_undefined_pd(), \ + (__v4df)_mm256_setzero_pd(), \ (__mmask8)-1)) #define _mm512_mask_extractf64x4_pd(W, U, A, imm) \ @@ -3181,7 +3181,7 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, #define _mm512_extractf32x4_ps(A, I) \ ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v4sf)_mm_undefined_ps(), \ + (__v4sf)_mm_setzero_ps(), \ (__mmask8)-1)) #define _mm512_mask_extractf32x4_ps(W, U, A, imm) \ @@ 
-7107,7 +7107,7 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) #define _mm512_extracti32x4_epi32(A, imm) \ ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)_mm_undefined_si128(), \ + (__v4si)_mm_setzero_si128(), \ (__mmask8)-1)) #define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \ @@ -7122,7 +7122,7 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) #define _mm512_extracti64x4_epi64(A, imm) \ ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)_mm256_undefined_si256(), \ + (__v4di)_mm256_setzero_si256(), \ (__mmask8)-1)) #define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \ diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index d37b22285174e..7271e200bcaec 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -2452,11 +2452,9 @@ __m256d test_mm512_extractf64x4_pd(__m512d a) // CHECK: shufflevector <8 x double> %{{.*}}, <8 x double> poison, <4 x i32> return _mm512_extractf64x4_pd(a, 1); } -TEST_CONSTEXPR(match_m256d(_mm512_extractf64x4_pd(((__m512d){ - 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 - }), 1), - 4.0, 5.0, 6.0, 7.0)); - +TEST_CONSTEXPR(match_m256d(_mm512_extractf64x4_pd(((__m512d) +{0.0,1.0,2.0,3.0, 4.0,5.0,6.0,7.0}),1), + 4.0, 5.0, 6.0, 7.0)); __m256d test_mm512_mask_extractf64x4_pd(__m256d __W,__mmask8 __U,__m512d __A){ // CHECK-LABEL: test_mm512_mask_extractf64x4_pd @@ -2464,12 +2462,13 @@ __m256d test_mm512_mask_extractf64x4_pd(__m256d __W,__mmask8 __U,__m512d __A){ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm512_mask_extractf64x4_pd( __W, __U, __A, 1); } -TEST_CONSTEXPR(match_m256d(_mm512_mask_extractf64x4_pd( - (__m256d){100.0,101.0,102.0,103.0}, // W(merge) - (__mmask8)0x5, // 0101b - (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}, - 1), - 4.0, 101.0, 6.0, 103.0)); +TEST_CONSTEXPR(match_m256d( + _mm512_mask_extractf64x4_pd( + ((__m256d){100.0,101.0,102.0,103.0}), // W (merge) + (__mmask8)0x5, + ((__m512d){0.0,1.0,2.0,3.0, 4.0,5.0,6.0,7.0}), + 1), + 4.0, 101.0, 6.0, 103.0)); __m256d test_mm512_maskz_extractf64x4_pd(__mmask8 __U,__m512d __A){ // CHECK-LABEL: test_mm512_maskz_extractf64x4_pd @@ -2477,11 +2476,12 @@ __m256d test_mm512_maskz_extractf64x4_pd(__mmask8 __U,__m512d __A){ // CHECK: select <4 x i1> %{{.*}}, <4 x double> %{{.*}}, <4 x double> %{{.*}} return _mm512_maskz_extractf64x4_pd( __U, __A, 1); } -TEST_CONSTEXPR(match_m256d(_mm512_maskz_extractf64x4_pd( - (__mmask8)0x3, - (__m512d){0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0}, - 1), - 4.0, 5.0, 0.0, 0.0)); +TEST_CONSTEXPR(match_m256d( + _mm512_maskz_extractf64x4_pd( + (__mmask8)0x3, + ((__m512d){0.0,1.0,2.0,3.0, 4.0,5.0,6.0,7.0}), + 1), + 4.0, 5.0, 0.0, 0.0)); __m128 test_mm512_extractf32x4_ps(__m512 a) { @@ -2489,9 +2489,9 @@ __m128 test_mm512_extractf32x4_ps(__m512 a) // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> poison, <4 x i32> return _mm512_extractf32x4_ps(a, 1); } -TEST_CONSTEXPR(match_m128(_mm512_extractf32x4_ps(((__m512){ - 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 - }), 1), +TEST_CONSTEXPR(match_m128(_mm512_extractf32x4_ps( + ((__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), + 1), 4.0f, 5.0f, 6.0f, 7.0f)); __m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U,__m512 __A){ @@ -2501,9 +2501,9 @@ __m128 test_mm512_mask_extractf32x4_ps(__m128 __W, __mmask8 __U,__m512 __A){ return _mm512_mask_extractf32x4_ps( __W, __U, __A, 
1); } TEST_CONSTEXPR(match_m128(_mm512_mask_extractf32x4_ps( - (__m128){100,101,102,103}, // W(merge) - (__mmask8)0x5, // 0101b - (__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}, + ((__m128){100,101,102,103}), + (__mmask8)0x5, + ((__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1), 4.0f, 101.0f, 6.0f, 103.0f)); @@ -2515,7 +2515,7 @@ __m128 test_mm512_maskz_extractf32x4_ps( __mmask8 __U,__m512 __A){ } TEST_CONSTEXPR(match_m128(_mm512_maskz_extractf32x4_ps( (__mmask8)0x3, - (__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}, + ((__m512){0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 1), 4.0f, 5.0f, 0.0f, 0.0f)); @@ -7388,10 +7388,11 @@ __m128i test_mm512_extracti32x4_epi32(__m512i __A) { // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> poison, <4 x i32> return _mm512_extracti32x4_epi32(__A, 3); } -TEST_CONSTEXPR(match_m128i(_mm512_extracti32x4_epi32(((__m512i){ - 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 - }), 3), - 12, 13, 14, 15)); +TEST_CONSTEXPR(match_m128i(_mm512_extracti32x4_epi32(((__m512i)(__v16si) + {0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 3), + 0x0000000D0000000CULL, // (13<<32)|12 + 0x0000000F0000000EULL + )); __m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_extracti32x4_epi32 @@ -7400,14 +7401,16 @@ __m128i test_mm512_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m512i __ return _mm512_mask_extracti32x4_epi32(__W, __U, __A, 3); } TEST_CONSTEXPR(match_m128i(_mm512_mask_extracti32x4_epi32( - (__m128i){100,101,102,103}, // merge=W + ((__m128i)(__v4si){100,101,102,103}), // merge=W (__mmask8)0x5, // 0101b - (__m512i){ + ((__m512i)(__v16si){ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15 - }, + }), 3), - 12, 101, 14, 103)); + 0x000000650000000CULL, // (101<<32)|12 + 0x000000670000000EULL // (103<<32)|14 + )); __m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) { // CHECK-LABEL: test_mm512_maskz_extracti32x4_epi32 @@ -7417,21 +7420,19 @@ __m128i test_mm512_maskz_extracti32x4_epi32(__mmask8 __U, __m512i __A) { } TEST_CONSTEXPR(match_m128i(_mm512_maskz_extracti32x4_epi32( (__mmask8)0x3, - (__m512i){ - 0,1,2,3, 4,5,6,7, - 8,9,10,11, 12,13,14,15 - }, + ((__m512i)(__v16si){0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}), 3), -12, 13, 0, 0)); + 0x0000000D0000000CULL, // (13<<32)|12 + 0x0000000000000000ULL + )); __m256i test_mm512_extracti64x4_epi64(__m512i __A) { // CHECK-LABEL: test_mm512_extracti64x4_epi64 // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> poison, <4 x i32> return _mm512_extracti64x4_epi64(__A, 1); } -TEST_CONSTEXPR(match_m256i(_mm512_extracti64x4_epi64(((__m512i){ - 0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL - }), 1), +TEST_CONSTEXPR(match_m256i( + _mm512_extracti64x4_epi64(((__m512i)(__v8di){0,1,2,3,4,5,6,7}), 1), 4ULL, 5ULL, 6ULL, 7ULL)); __m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __A) { @@ -7440,10 +7441,10 @@ __m256i test_mm512_mask_extracti64x4_epi64(__m256i __W, __mmask8 __U, __m512i __ // CHECK: select <4 x i1> %{{.*}}, <4 x i64> %{{.*}}, <4 x i64> %{{.*}} return _mm512_mask_extracti64x4_epi64(__W, __U, __A, 1); } -TEST_CONSTEXPR(match_m256i_64(_mm512_mask_extracti64x4_epi64( - (__m256i){100ULL,101ULL,102ULL,103ULL}, +TEST_CONSTEXPR(match_m256i(_mm512_mask_extracti64x4_epi64( + ((__m256i)(__v4di){100ULL,101ULL,102ULL,103ULL}), // W (__mmask8)0x5, - (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}, + (((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL})), 1), 4ULL, 101ULL, 6ULL, 103ULL)); @@ -7455,7 +7456,7 @@ __m256i 
test_mm512_maskz_extracti64x4_epi64(__mmask8 __U, __m512i __A) { } TEST_CONSTEXPR(match_m256i(_mm512_maskz_extracti64x4_epi64( (__mmask8)0x3, - (__m512i){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL}, + (((__m512i)(__v8di){0ULL,1ULL,2ULL,3ULL, 4ULL,5ULL,6ULL,7ULL})), 1), 4ULL, 5ULL, 0ULL, 0ULL)); From 86e46dbaf94226966208496a731a0af83f032efa Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Tue, 30 Sep 2025 00:20:27 +0900 Subject: [PATCH 18/21] Add test and refactoring --- clang/lib/Headers/avx512vldqintrin.h | 4 +- clang/lib/Headers/avx512vlintrin.h | 4 +- clang/test/CodeGen/X86/avx512vl-builtins.c | 64 +++++++++++--------- clang/test/CodeGen/X86/avx512vldq-builtins.c | 18 +++--- 4 files changed, 48 insertions(+), 42 deletions(-) diff --git a/clang/lib/Headers/avx512vldqintrin.h b/clang/lib/Headers/avx512vldqintrin.h index 68bd52e43981a..2d3c4b551e3b0 100644 --- a/clang/lib/Headers/avx512vldqintrin.h +++ b/clang/lib/Headers/avx512vldqintrin.h @@ -1075,7 +1075,7 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) #define _mm256_extractf64x2_pd(A, imm) \ ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ (int)(imm), \ - (__v2df)_mm_undefined_pd(), \ + (__v2df)_mm_setzero_pd(), \ (__mmask8)-1)) #define _mm256_mask_extractf64x2_pd(W, U, A, imm) \ @@ -1093,7 +1093,7 @@ _mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) #define _mm256_extracti64x2_epi64(A, imm) \ ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ (int)(imm), \ - (__v2di)_mm_undefined_si128(), \ + (__v2di)_mm_setzero_si128(), \ (__mmask8)-1)) #define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \ diff --git a/clang/lib/Headers/avx512vlintrin.h b/clang/lib/Headers/avx512vlintrin.h index 965741f0ff944..252fb111988b0 100644 --- a/clang/lib/Headers/avx512vlintrin.h +++ b/clang/lib/Headers/avx512vlintrin.h @@ -7609,7 +7609,7 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) #define _mm256_extractf32x4_ps(A, imm) \ ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ (int)(imm), \ - (__v4sf)_mm_undefined_ps(), \ + (__v4sf)_mm_setzero_ps(), \ (__mmask8)-1)) #define _mm256_mask_extractf32x4_ps(W, U, A, imm) \ @@ -7627,7 +7627,7 @@ _mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) #define _mm256_extracti32x4_epi32(A, imm) \ ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ (int)(imm), \ - (__v4si)_mm_undefined_si128(), \ + (__v4si)_mm_setzero_si128(), \ (__mmask8)-1)) #define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \ diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 323ac1b2cab63..4e2a31a26868a 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -9875,9 +9875,8 @@ __m128 test_mm256_extractf32x4_ps(__m256 __A) { // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> poison, <4 x i32> return _mm256_extractf32x4_ps(__A, 1); } -TEST_CONSTEXPR(match_m128(_mm256_extractf32x4_ps(((__m256){ - 0,1,2,3, 4,5,6,7 - }), 1), +TEST_CONSTEXPR(match_m128( + _mm256_extractf32x4_ps(((__m256){0,1,2,3, 4,5,6,7}), 1), 4.0f, 5.0f, 6.0f, 7.0f)); __m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) { @@ -9886,12 +9885,13 @@ __m128 test_mm256_mask_extractf32x4_ps(__m128 __W, __mmask8 __U, __m256 __A) { // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm256_mask_extractf32x4_ps(__W, __U, __A, 1); } -TEST_CONSTEXPR( match_m128(_mm256_mask_extractf32x4_ps( - 
(__m128){100,101,102,103}, // W (merge) - (__mmask8)0x5, // 0101b - (__m256){0,1,2,3, 4,5,6,7}, - 1), - 4.0f, 101.0f, 6.0f, 103.0f)); +TEST_CONSTEXPR(match_m128( + _mm256_mask_extractf32x4_ps( + (((__m128){100,101,102,103})), + (__mmask8)0x5, + (((__m256){0,1,2,3, 4,5,6,7})), + 1), + 4.0f, 101.0f, 6.0f, 103.0f)); __m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) { // CHECK-LABEL: test_mm256_maskz_extractf32x4_ps @@ -9899,21 +9899,23 @@ __m128 test_mm256_maskz_extractf32x4_ps(__mmask8 __U, __m256 __A) { // CHECK: select <4 x i1> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}} return _mm256_maskz_extractf32x4_ps(__U, __A, 1); } -TEST_CONSTEXPR(match_m128(_mm256_maskz_extractf32x4_ps( - (__mmask8)0x3, - (__m256){0,1,2,3, 4,5,6,7}, - 1), - 4.0f, 5.0f, 0.0f, 0.0f)); +TEST_CONSTEXPR(match_m128( + _mm256_maskz_extractf32x4_ps( + (__mmask8)0x3, + (((__m256){0,1,2,3, 4,5,6,7})), + 1), + 4.0f, 5.0f, 0.0f, 0.0f)); __m128i test_mm256_extracti32x4_epi32(__m256i __A) { // CHECK-LABEL: test_mm256_extracti32x4_epi32 // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> poison, <4 x i32> return _mm256_extracti32x4_epi32(__A, 1); } -TEST_CONSTEXPR(match_m128i(_mm256_extracti32x4_epi32(((__m256i){ - 0,1,2,3, 4,5,6,7 - }), 1), - 4, 5, 6, 7)); +TEST_CONSTEXPR(match_m128i( + _mm256_extracti32x4_epi32( + (((__m256i)(__v8si){0,1,2,3, 4,5,6,7})), 1), + 0x0000000500000004ULL, + 0x0000000700000006ULL)); __m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_mask_extracti32x4_epi32 @@ -9921,12 +9923,14 @@ __m128i test_mm256_mask_extracti32x4_epi32(__m128i __W, __mmask8 __U, __m256i __ // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm256_mask_extracti32x4_epi32(__W, __U, __A, 1); } -TEST_CONSTEXPR(match_m128i(_mm256_mask_extracti32x4_epi32( - (__m128i){100,101,102,103}, // W (merge) - (__mmask8)0xA, // 1010b - (__m256i){0,1,2,3, 4,5,6,7}, - 1), - 100, 5, 102, 7)); +TEST_CONSTEXPR(match_m128i( + _mm256_mask_extracti32x4_epi32( + (((__m128i)(__v4si){100,101,102,103})), + (__mmask8)0xA, + (((__m256i)(__v8si){0,1,2,3, 4,5,6,7})), + 1), + 0x0000000500000064ULL, + 0x0000000700000066ULL)); __m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) { // CHECK-LABEL: test_mm256_maskz_extracti32x4_epi32 @@ -9934,11 +9938,13 @@ __m128i test_mm256_maskz_extracti32x4_epi32(__mmask8 __U, __m256i __A) { // CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}} return _mm256_maskz_extracti32x4_epi32(__U, __A, 1); } -TEST_CONSTEXPR(match_m128i(_mm256_maskz_extracti32x4_epi32( - (__mmask8)0x3, - (__m256i){0,1,2,3, 4,5,6,7}, - 1), - 4, 5, 0, 0)); +TEST_CONSTEXPR(match_m128i( + _mm256_maskz_extracti32x4_epi32( + (__mmask8)0x3, + (((__m256i)(__v8si){0,1,2,3, 4,5,6,7})), + 1), + 0x0000000500000004ULL, + 0x0000000000000000ULL)); __m256 test_mm256_insertf32x4(__m256 __A, __m128 __B) { // CHECK-LABEL: test_mm256_insertf32x4 diff --git a/clang/test/CodeGen/X86/avx512vldq-builtins.c b/clang/test/CodeGen/X86/avx512vldq-builtins.c index 9cfcfea3dafc7..d566363d1f291 100644 --- a/clang/test/CodeGen/X86/avx512vldq-builtins.c +++ b/clang/test/CodeGen/X86/avx512vldq-builtins.c @@ -1093,9 +1093,9 @@ __m128d test_mm256_mask_extractf64x2_pd(__m128d __W, __mmask8 __U, __m256d __A) return _mm256_mask_extractf64x2_pd(__W, __U, __A, 1); } TEST_CONSTEXPR(match_m128d(_mm256_mask_extractf64x2_pd( - (__m128d){100.0, 101.0}, // W(merge) + (((__m128d){100.0, 101.0})), // W(merge) (__mmask8)0x1, - (__m256d){0.0,1.0,2.0,3.0}, + 
(((__m256d){0.0,1.0,2.0,3.0})), 1), 2.0, 101.0)); @@ -1107,7 +1107,7 @@ __m128d test_mm256_maskz_extractf64x2_pd(__mmask8 __U, __m256d __A) { } TEST_CONSTEXPR(match_m128d(_mm256_maskz_extractf64x2_pd( (__mmask8)0x2, - (__m256d){0.0,1.0,2.0,3.0}, + (((__m256d){0.0,1.0,2.0,3.0})), 1), 0.0, 3.0)); @@ -1116,7 +1116,7 @@ __m128i test_mm256_extracti64x2_epi64(__m256i __A) { // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> poison, <2 x i32> return _mm256_extracti64x2_epi64(__A, 1); } -TEST_CONSTEXPR(match_m128i_64(_mm256_extracti64x2_epi64(((__m256i){0ULL,1ULL,2ULL,3ULL}), 1), +TEST_CONSTEXPR(match_m128i(_mm256_extracti64x2_epi64(((__m256i){0ULL,1ULL,2ULL,3ULL}), 1), 2ULL, 3ULL)); __m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __A) { @@ -1125,10 +1125,10 @@ __m128i test_mm256_mask_extracti64x2_epi64(__m128i __W, __mmask8 __U, __m256i __ // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm256_mask_extracti64x2_epi64(__W, __U, __A, 1); } -TEST_CONSTEXPR(match_m128i_64(_mm256_mask_extracti64x2_epi64( - (__m128i){100ULL, 101ULL}, // W(merge) +TEST_CONSTEXPR(match_m128i(_mm256_mask_extracti64x2_epi64( + (((__m128i){100ULL, 101ULL})), // W(merge) (__mmask8)0x1, - (__m256i){0ULL,1ULL,2ULL,3ULL}, + (((__m256i){0ULL,1ULL,2ULL,3ULL})), 1), 2ULL, 101ULL)); @@ -1138,9 +1138,9 @@ __m128i test_mm256_maskz_extracti64x2_epi64(__mmask8 __U, __m256i __A) { // CHECK: select <2 x i1> %{{.*}}, <2 x i64> %{{.*}}, <2 x i64> %{{.*}} return _mm256_maskz_extracti64x2_epi64(__U, __A, 1); } -TEST_CONSTEXPR(match_m128i_64(_mm256_maskz_extracti64x2_epi64( +TEST_CONSTEXPR(match_m128i(_mm256_maskz_extracti64x2_epi64( (__mmask8)0x2, - (__m256i){0ULL,1ULL,2ULL,3ULL}, + (((__m256i){0ULL,1ULL,2ULL,3ULL})), 1), 0ULL, 3ULL)); From 93b6fb3f5d74f5bfde70bf1c6f9a3a272599bf3f Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Mon, 6 Oct 2025 19:30:04 +0900 Subject: [PATCH 19/21] Remove comment --- clang/lib/AST/ExprConstant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index ae59da87a3ccd..60d819c319084 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12100,7 +12100,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if ((K >> i) & 1) ResultElements.push_back(A.getVectorElt(base + i)); else - ResultElements.push_back(W.getVectorElt(i)); // maskz/unmasked 모두 헤더에서 맞춰줌 + ResultElements.push_back(W.getVectorElt(i)); } return Success(APValue(ResultElements.data(), ResultElements.size()), E); } From ce4577a23917068abcd9ac4acdda9fef06f5af8e Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Thu, 9 Oct 2025 01:53:26 +0900 Subject: [PATCH 20/21] Refactor review comments and remove unrelated files --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 45 +++++++++++------------- clang/lib/AST/ExprConstant.cpp | 39 ++++++++++---------- 2 files changed, 39 insertions(+), 45 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 05ef09b3cbaee..8bed87f027712 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -2855,16 +2855,13 @@ static bool interp__builtin_x86_extract_vector(InterpState &S, CodePtr OpPC, unsigned ID) { assert(Call->getNumArgs() == 2); - // srcimm APSInt ImmAPS = popToAPSInt(S, Call->getArg(1)); uint64_t Index = ImmAPS.getZExtValue(); - // srcvec const Pointer &Src = S.Stk.pop(); if (!Src.getFieldDesc()->isPrimitiveArray()) return false; - 
// destination (return value) const Pointer &Dst = S.Stk.peek(); if (!Dst.getFieldDesc()->isPrimitiveArray()) return false; @@ -2879,12 +2876,11 @@ static bool interp__builtin_x86_extract_vector(InterpState &S, CodePtr OpPC, unsigned Lane = static_cast(Index % NumLanes); unsigned ExtractPos = Lane * DstElems; - // element type - PrimType ElemPT = Src.getFieldDesc()->getPrimType(); - if (ElemPT != Dst.getFieldDesc()->getPrimType()) + PrimType ElemT = Src.getFieldDesc()->getPrimType(); + if (ElemT != Dst.getFieldDesc()->getPrimType()) return false; - TYPE_SWITCH(ElemPT, { + TYPE_SWITCH(ElemT, { for (unsigned I = 0; I != DstElems; ++I) { Dst.elem(I) = Src.elem(ExtractPos + I); } @@ -2899,41 +2895,40 @@ static bool interp__builtin_x86_extract_vector_masked(InterpState &S, CodePtr Op unsigned ID) { assert(Call->getNumArgs() == 4); - APSInt UAPS = popToAPSInt(S, Call->getArg(3)); - const Pointer &W = S.Stk.pop(); + APSInt MaskAPS = popToAPSInt(S, Call->getArg(3)); + const Pointer &Merge = S.Stk.pop(); APSInt ImmAPS = popToAPSInt(S, Call->getArg(1)); - const Pointer &A = S.Stk.pop(); + const Pointer &Src = S.Stk.pop(); - if (!A.getFieldDesc()->isPrimitiveArray() || !W.getFieldDesc()->isPrimitiveArray()) + if (!Src.getFieldDesc()->isPrimitiveArray() || !Merge.getFieldDesc()->isPrimitiveArray()) return false; const Pointer &Dst = S.Stk.peek(); if (!Dst.getFieldDesc()->isPrimitiveArray()) return false; - unsigned SrcElems = A.getNumElems(); + unsigned SrcElems = Src.getNumElems(); unsigned DstElems = Dst.getNumElems(); if (!SrcElems || !DstElems || (SrcElems % DstElems) != 0) return false; - // 타입 일치 체크 - PrimType PT = A.getFieldDesc()->getPrimType(); - if (PT != Dst.getFieldDesc()->getPrimType() || - PT != W.getFieldDesc()->getPrimType()) + PrimType ElemT = Src.getFieldDesc()->getPrimType(); + if (ElemT != Dst.getFieldDesc()->getPrimType() || + ElemT != Merge.getFieldDesc()->getPrimType()) return false; - unsigned numLanes = SrcElems / DstElems; - unsigned lane = static_cast(ImmAPS.getZExtValue() % numLanes); - unsigned base = lane * DstElems; + unsigned NumLanes = SrcElems / DstElems; + unsigned Lane = static_cast(ImmAPS.getZExtValue() % NumLanes); + unsigned Base = Lane * DstElems; - uint64_t U = UAPS.getZExtValue(); + uint64_t Mask = MaskAPS.getZExtValue(); - TYPE_SWITCH(PT, { - for (unsigned i = 0; i < DstElems; ++i) { - if ((U >> i) & 1) - Dst.elem(i) = A.elem(base + i); + TYPE_SWITCH(ElemT, { + for (unsigned I = 0; I < DstElems; ++I) { + if ((Mask >> I) & 1) + Dst.elem(I) = Src.elem(Base + I); else - Dst.elem(i) = W.elem(i); + Dst.elem(I) = Merge.elem(I); } }); diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 60d819c319084..25ded19554518 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12049,13 +12049,13 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { if (SrcLen != RetLen * 2) return false; - unsigned idx = SourceImm.getInt().getZExtValue() & 1; + unsigned Idx = SourceImm.getInt().getZExtValue() & 1; SmallVector ResultElements; ResultElements.reserve(RetLen); - for (unsigned i = 0; i < RetLen; i++) - ResultElements.push_back(SourceVec.getVectorElt(idx * RetLen + i)); + for (unsigned I = 0; I < RetLen; I++) + ResultElements.push_back(SourceVec.getVectorElt(Idx * RetLen + I)); return Success(APValue(ResultElements.data(), RetLen), E); } @@ -12072,35 +12072,34 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { case X86::BI__builtin_ia32_extractf64x2_512_mask: case 
X86::BI__builtin_ia32_extracti64x4_mask: case X86::BI__builtin_ia32_extractf64x4_mask:{ - APValue A, W; - APSInt Imm, U; + APValue SourceVec, MergeVec; + APSInt Imm, MaskImm; - if (!EvaluateAsRValue(Info, E->getArg(0), A) || // A - !EvaluateInteger(E->getArg(1), Imm, Info) || // imm - !EvaluateAsRValue(Info, E->getArg(2), W) || // W (merge) - !EvaluateInteger(E->getArg(3), U, Info)) // U (mask) + if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) || + !EvaluateInteger(E->getArg(1), Imm, Info) || + !EvaluateAsRValue(Info, E->getArg(2), MergeVec) || + !EvaluateInteger(E->getArg(3), MaskImm, Info)) return false; const auto *RetVT = E->getType()->castAs(); - // QualType EltTy = RetVT->getElementType(); unsigned RetLen = RetVT->getNumElements(); - if (!A.isVector() || !W.isVector()) return false; - unsigned SrcLen = A.getVectorLength(); + if (!SourceVec.isVector() || !MergeVec.isVector()) return false; + unsigned SrcLen = SourceVec.getVectorLength(); if (!SrcLen || !RetLen || (SrcLen % RetLen) != 0) return false; - unsigned lanes = SrcLen / RetLen; - unsigned lane = static_cast(Imm.getZExtValue() % lanes); - unsigned base = lane * RetLen; - uint64_t K = U.getZExtValue(); + unsigned Lanes = SrcLen / RetLen; + unsigned Lane = static_cast(Imm.getZExtValue() % Lanes); + unsigned Base = Lane * RetLen; + uint64_t Mask = MaskImm.getZExtValue(); SmallVector ResultElements; ResultElements.reserve(RetLen); - for (unsigned i = 0; i < RetLen; ++i) { - if ((K >> i) & 1) - ResultElements.push_back(A.getVectorElt(base + i)); + for (unsigned I = 0; I < RetLen; ++I) { + if ((Mask >> I) & 1) + ResultElements.push_back(SourceVec.getVectorElt(Base + I)); else - ResultElements.push_back(W.getVectorElt(i)); + ResultElements.push_back(MergeVec.getVectorElt(I)); } return Success(APValue(ResultElements.data(), ResultElements.size()), E); } From 24e06be02a1133e834c54e4aac813b274c732dc4 Mon Sep 17 00:00:00 2001 From: SeongjaeP Date: Thu, 9 Oct 2025 14:25:24 +0900 Subject: [PATCH 21/21] Apply style fixes and rebase onto upstream --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 314 +++++------------------ clang/lib/AST/ExprConstant.cpp | 6 +- 2 files changed, 71 insertions(+), 249 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index 8bed87f027712..c8479b9b09a17 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -678,6 +678,30 @@ static bool interp__builtin_popcount(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_parity(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const CallExpr *Call) { + APSInt Val = popToAPSInt(S, Call->getArg(0)); + pushInteger(S, Val.popcount() % 2, Call->getType()); + return true; +} + +static bool interp__builtin_clrsb(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const CallExpr *Call) { + APSInt Val = popToAPSInt(S, Call->getArg(0)); + pushInteger(S, Val.getBitWidth() - Val.getSignificantBits(), Call->getType()); + return true; +} + +static bool interp__builtin_bitreverse(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const CallExpr *Call) { + APSInt Val = popToAPSInt(S, Call->getArg(0)); + pushInteger(S, Val.reverseBits(), Call->getType()); + return true; +} + static bool interp__builtin_classify_type(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { @@ -2310,14 +2334,10 @@ static bool interp__builtin_object_size(InterpState &S, CodePtr OpPC, if (Ptr.isBaseClass()) ByteOffset = 
computePointerOffset(ASTCtx, Ptr.getBase()) - computePointerOffset(ASTCtx, Ptr); - else { - if (Ptr.inArray()) - ByteOffset = - computePointerOffset(ASTCtx, Ptr) - - computePointerOffset(ASTCtx, Ptr.expand().atIndex(0).narrow()); - else - ByteOffset = 0; - } + else + ByteOffset = + computePointerOffset(ASTCtx, Ptr) - + computePointerOffset(ASTCtx, Ptr.expand().atIndex(0).narrow()); } else ByteOffset = computePointerOffset(ASTCtx, Ptr); @@ -2579,11 +2599,9 @@ static bool interp__builtin_elementwise_maxmin(InterpState &S, CodePtr OpPC, return true; } -static bool interp__builtin_ia32_pmul( - InterpState &S, CodePtr OpPC, const CallExpr *Call, - llvm::function_ref - Fn) { +static bool interp__builtin_ia32_pmul(InterpState &S, CodePtr OpPC, + const CallExpr *Call, + unsigned BuiltinID) { assert(Call->getArg(0)->getType()->isVectorType() && Call->getArg(1)->getType()->isVectorType()); const Pointer &RHS = S.Stk.pop(); @@ -2592,23 +2610,35 @@ static bool interp__builtin_ia32_pmul( const auto *VT = Call->getArg(0)->getType()->castAs(); PrimType ElemT = *S.getContext().classify(VT->getElementType()); - unsigned NumElems = VT->getNumElements(); - const auto *DestVT = Call->getType()->castAs(); - PrimType DestElemT = *S.getContext().classify(DestVT->getElementType()); - bool DestUnsigned = Call->getType()->isUnsignedIntegerOrEnumerationType(); + unsigned SourceLen = VT->getNumElements(); + PrimType DstElemT = *S.getContext().classify( + Call->getType()->castAs()->getElementType()); unsigned DstElem = 0; - for (unsigned I = 0; I != NumElems; I += 2) { - APSInt Result; + for (unsigned I = 0; I != SourceLen; I += 2) { + APSInt Elem1; + APSInt Elem2; INT_TYPE_SWITCH_NO_BOOL(ElemT, { - APSInt LoLHS = LHS.elem(I).toAPSInt(); - APSInt HiLHS = LHS.elem(I + 1).toAPSInt(); - APSInt LoRHS = RHS.elem(I).toAPSInt(); - APSInt HiRHS = RHS.elem(I + 1).toAPSInt(); - Result = APSInt(Fn(LoLHS, HiLHS, LoRHS, HiRHS), DestUnsigned); + Elem1 = LHS.elem(I).toAPSInt(); + Elem2 = RHS.elem(I).toAPSInt(); }); - INT_TYPE_SWITCH_NO_BOOL(DestElemT, + APSInt Result; + switch (BuiltinID) { + case clang::X86::BI__builtin_ia32_pmuludq128: + case clang::X86::BI__builtin_ia32_pmuludq256: + case clang::X86::BI__builtin_ia32_pmuludq512: + Result = APSInt(llvm::APIntOps::muluExtended(Elem1, Elem2), + /*IsUnsigned=*/true); + break; + case clang::X86::BI__builtin_ia32_pmuldq128: + case clang::X86::BI__builtin_ia32_pmuldq256: + case clang::X86::BI__builtin_ia32_pmuldq512: + Result = APSInt(llvm::APIntOps::mulsExtended(Elem1, Elem2), + /*IsUnsigned=*/false); + break; + } + INT_TYPE_SWITCH_NO_BOOL(DstElemT, { Dst.elem(DstElem) = static_cast(Result); }); ++DstElem; } @@ -2744,48 +2774,6 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC, return true; } -static bool interp__builtin_ia32_pshuf(InterpState &S, CodePtr OpPC, - const CallExpr *Call, bool IsShufHW) { - assert(Call->getNumArgs() == 2 && "masked forms handled via select*"); - APSInt ControlImm = popToAPSInt(S, Call->getArg(1)); - const Pointer &Src = S.Stk.pop(); - const Pointer &Dst = S.Stk.peek(); - - unsigned NumElems = Dst.getNumElems(); - PrimType ElemT = Dst.getFieldDesc()->getPrimType(); - - unsigned ElemBits = static_cast(primSize(ElemT) * 8); - if (ElemBits != 16 && ElemBits != 32) - return false; - - unsigned LaneElts = 128u / ElemBits; - assert(LaneElts && (NumElems % LaneElts == 0)); - - uint8_t Ctl = static_cast(ControlImm.getZExtValue()); - - for (unsigned Idx = 0; Idx != NumElems; Idx++) { - unsigned LaneBase = (Idx / LaneElts) * LaneElts; - unsigned 
LaneIdx = Idx % LaneElts; - unsigned SrcIdx = Idx; - unsigned Sel = (Ctl >> (2 * (LaneIdx & 0x3))) & 0x3; - if (ElemBits == 32) { - SrcIdx = LaneBase + Sel; - } else { - constexpr unsigned HalfSize = 4; - bool InHigh = LaneIdx >= HalfSize; - if (!IsShufHW && !InHigh) { - SrcIdx = LaneBase + Sel; - } else if (IsShufHW && InHigh) { - SrcIdx = LaneBase + HalfSize + Sel; - } - } - - INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem(Idx) = Src.elem(SrcIdx); }); - } - Dst.initializeAllElements(); - return true; -} - static bool interp__builtin_elementwise_triop( InterpState &S, CodePtr OpPC, const CallExpr *Call, llvm::function_ref @@ -2918,13 +2906,13 @@ static bool interp__builtin_x86_extract_vector_masked(InterpState &S, CodePtr Op return false; unsigned NumLanes = SrcElems / DstElems; - unsigned Lane = static_cast(ImmAPS.getZExtValue() % NumLanes); - unsigned Base = Lane * DstElems; + unsigned Lane = static_cast(ImmAPS.getZExtValue() % NumLanes); + unsigned Base = Lane * DstElems; uint64_t Mask = MaskAPS.getZExtValue(); TYPE_SWITCH(ElemT, { - for (unsigned I = 0; I < DstElems; ++I) { + for (unsigned I = 0; I != DstElems; ++I) { if ((Mask >> I) & 1) Dst.elem(I) = Src.elem(Base + I); else @@ -2975,104 +2963,7 @@ static bool interp__builtin_x86_insert_subvector(InterpState &S, CodePtr OpPC, }); Dst.initializeAllElements(); - return true; -} -static bool interp__builtin_ia32_pternlog(InterpState &S, CodePtr OpPC, - const CallExpr *Call, bool MaskZ) { - assert(Call->getNumArgs() == 5); - - APInt U = popToAPSInt(S, Call->getArg(4)); // Lane mask - APInt Imm = popToAPSInt(S, Call->getArg(3)); // Ternary truth table - const Pointer &C = S.Stk.pop(); - const Pointer &B = S.Stk.pop(); - const Pointer &A = S.Stk.pop(); - const Pointer &Dst = S.Stk.peek(); - - unsigned DstLen = A.getNumElems(); - const QualType ElemQT = getElemType(A); - const OptPrimType ElemPT = S.getContext().classify(ElemQT); - unsigned LaneWidth = S.getASTContext().getTypeSize(ElemQT); - bool DstUnsigned = ElemQT->isUnsignedIntegerOrEnumerationType(); - - INT_TYPE_SWITCH_NO_BOOL(*ElemPT, { - for (unsigned I = 0; I != DstLen; ++I) { - APInt ALane = A.elem(I).toAPSInt(); - APInt BLane = B.elem(I).toAPSInt(); - APInt CLane = C.elem(I).toAPSInt(); - APInt RLane(LaneWidth, 0); - if (U[I]) { // If lane not masked, compute ternary logic. - for (unsigned Bit = 0; Bit != LaneWidth; ++Bit) { - unsigned ABit = ALane[Bit]; - unsigned BBit = BLane[Bit]; - unsigned CBit = CLane[Bit]; - unsigned Idx = (ABit << 2) | (BBit << 1) | (CBit); - RLane.setBitVal(Bit, Imm[Idx]); - } - Dst.elem(I) = static_cast(APSInt(RLane, DstUnsigned)); - } else if (MaskZ) { // If zero masked, zero the lane. - Dst.elem(I) = static_cast(APSInt(RLane, DstUnsigned)); - } else { // Just masked, put in A lane. 
- Dst.elem(I) = static_cast(APSInt(ALane, DstUnsigned)); - } - } - }); - Dst.initializeAllElements(); - return true; -} - -static bool interp__builtin_vec_ext(InterpState &S, CodePtr OpPC, - const CallExpr *Call, unsigned ID) { - assert(Call->getNumArgs() == 2); - - APSInt ImmAPS = popToAPSInt(S, Call->getArg(1)); - const Pointer &Vec = S.Stk.pop(); - if (!Vec.getFieldDesc()->isPrimitiveArray()) - return false; - - unsigned NumElems = Vec.getNumElems(); - unsigned Index = - static_cast(ImmAPS.getZExtValue() & (NumElems - 1)); - - PrimType ElemPT = Vec.getFieldDesc()->getPrimType(); - // FIXME(#161685): Replace float+int split with a numeric-only type switch - if (ElemPT == PT_Float) { - S.Stk.push(Vec.elem(Index)); - return true; - } - INT_TYPE_SWITCH_NO_BOOL(ElemPT, { - APSInt V = Vec.elem(Index).toAPSInt(); - pushInteger(S, V, Call->getType()); - }); - - return true; -} - -static bool interp__builtin_vec_set(InterpState &S, CodePtr OpPC, - const CallExpr *Call, unsigned ID) { - assert(Call->getNumArgs() == 3); - - APSInt ImmAPS = popToAPSInt(S, Call->getArg(2)); - APSInt ValAPS = popToAPSInt(S, Call->getArg(1)); - - const Pointer &Base = S.Stk.pop(); - if (!Base.getFieldDesc()->isPrimitiveArray()) - return false; - - const Pointer &Dst = S.Stk.peek(); - - unsigned NumElems = Base.getNumElems(); - unsigned Index = - static_cast(ImmAPS.getZExtValue() & (NumElems - 1)); - - PrimType ElemPT = Base.getFieldDesc()->getPrimType(); - INT_TYPE_SWITCH_NO_BOOL(ElemPT, { - for (unsigned I = 0; I != NumElems; ++I) - Dst.elem(I) = Base.elem(I); - Dst.elem(Index) = static_cast(ValAPS); - }); - - Dst.initializeAllElements(); return true; } @@ -3232,25 +3123,18 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BI__builtin_parity: case Builtin::BI__builtin_parityl: case Builtin::BI__builtin_parityll: - return interp__builtin_elementwise_int_unaryop( - S, OpPC, Call, [](const APSInt &Val) -> APInt { - return APInt(Val.getBitWidth(), Val.popcount() % 2); - }); + return interp__builtin_parity(S, OpPC, Frame, Call); + case Builtin::BI__builtin_clrsb: case Builtin::BI__builtin_clrsbl: case Builtin::BI__builtin_clrsbll: - return interp__builtin_elementwise_int_unaryop( - S, OpPC, Call, [](const APSInt &Val) -> APInt { - return APInt(Val.getBitWidth(), - Val.getBitWidth() - Val.getSignificantBits()); - }); + return interp__builtin_clrsb(S, OpPC, Frame, Call); + case Builtin::BI__builtin_bitreverse8: case Builtin::BI__builtin_bitreverse16: case Builtin::BI__builtin_bitreverse32: case Builtin::BI__builtin_bitreverse64: - return interp__builtin_elementwise_int_unaryop( - S, OpPC, Call, - [](const APSInt &Val) -> APInt { return Val.reverseBits(); }); + return interp__builtin_bitreverse(S, OpPC, Frame, Call); case Builtin::BI__builtin_classify_type: return interp__builtin_classify_type(S, OpPC, Frame, Call); @@ -3268,10 +3152,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BI_rotl: case Builtin::BI_lrotl: case Builtin::BI_rotl64: - return interp__builtin_elementwise_int_binop( - S, OpPC, Call, [](const APSInt &Value, const APSInt &Amount) -> APInt { - return Value.rotl(Amount); - }); + return interp__builtin_rotate(S, OpPC, Frame, Call, /*Right=*/false); case Builtin::BI__builtin_rotateright8: case Builtin::BI__builtin_rotateright16: @@ -3282,19 +3163,12 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BI_rotr: case Builtin::BI_lrotr: case Builtin::BI_rotr64: - return 
interp__builtin_elementwise_int_binop( - S, OpPC, Call, [](const APSInt &Value, const APSInt &Amount) -> APInt { - return Value.rotr(Amount); - }); + return interp__builtin_rotate(S, OpPC, Frame, Call, /*Right=*/true); case Builtin::BI__builtin_ffs: case Builtin::BI__builtin_ffsl: case Builtin::BI__builtin_ffsll: - return interp__builtin_elementwise_int_unaryop( - S, OpPC, Call, [](const APSInt &Val) { - return APInt(Val.getBitWidth(), - Val.isZero() ? 0u : Val.countTrailingZeros() + 1u); - }); + return interp__builtin_ffs(S, OpPC, Frame, Call); case Builtin::BIaddressof: case Builtin::BI__addressof: @@ -3604,7 +3478,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case clang::X86::BI__builtin_ia32_pmaddubsw128: case clang::X86::BI__builtin_ia32_pmaddubsw256: case clang::X86::BI__builtin_ia32_pmaddubsw512: - return interp__builtin_ia32_pmul( + return interp__builtin_ia32_pmadd( S, OpPC, Call, [](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &LoRHS, const APSInt &HiRHS) { @@ -3616,7 +3490,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case clang::X86::BI__builtin_ia32_pmaddwd128: case clang::X86::BI__builtin_ia32_pmaddwd256: case clang::X86::BI__builtin_ia32_pmaddwd512: - return interp__builtin_ia32_pmul( + return interp__builtin_ia32_pmadd( S, OpPC, Call, [](const APSInt &LoLHS, const APSInt &HiLHS, const APSInt &LoRHS, const APSInt &HiRHS) { @@ -3879,21 +3753,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_selectpd_512: return interp__builtin_select(S, OpPC, Call); - case X86::BI__builtin_ia32_pshuflw: - case X86::BI__builtin_ia32_pshuflw256: - case X86::BI__builtin_ia32_pshuflw512: - return interp__builtin_ia32_pshuf(S, OpPC, Call, false); - - case X86::BI__builtin_ia32_pshufhw: - case X86::BI__builtin_ia32_pshufhw256: - case X86::BI__builtin_ia32_pshufhw512: - return interp__builtin_ia32_pshuf(S, OpPC, Call, true); - - case X86::BI__builtin_ia32_pshufd: - case X86::BI__builtin_ia32_pshufd256: - case X86::BI__builtin_ia32_pshufd512: - return interp__builtin_ia32_pshuf(S, OpPC, Call, false); - case X86::BI__builtin_ia32_kandqi: case X86::BI__builtin_ia32_kandhi: case X86::BI__builtin_ia32_kandsi: @@ -3949,20 +3808,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) { return LHS + RHS; }); - case X86::BI__builtin_ia32_pternlogd128_mask: - case X86::BI__builtin_ia32_pternlogd256_mask: - case X86::BI__builtin_ia32_pternlogd512_mask: - case X86::BI__builtin_ia32_pternlogq128_mask: - case X86::BI__builtin_ia32_pternlogq256_mask: - case X86::BI__builtin_ia32_pternlogq512_mask: - return interp__builtin_ia32_pternlog(S, OpPC, Call, /*MaskZ=*/false); - case X86::BI__builtin_ia32_pternlogd128_maskz: - case X86::BI__builtin_ia32_pternlogd256_maskz: - case X86::BI__builtin_ia32_pternlogd512_maskz: - case X86::BI__builtin_ia32_pternlogq128_maskz: - case X86::BI__builtin_ia32_pternlogq256_maskz: - case X86::BI__builtin_ia32_pternlogq512_maskz: - return interp__builtin_ia32_pternlog(S, OpPC, Call, /*MaskZ=*/true); case Builtin::BI__builtin_elementwise_fshl: return interp__builtin_elementwise_triop(S, OpPC, Call, llvm::APIntOps::fshl); @@ -3988,29 +3833,6 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_insert128i256: return interp__builtin_x86_insert_subvector(S, OpPC, Call, BuiltinID); - case X86::BI__builtin_ia32_vec_ext_v4hi: - case 
X86::BI__builtin_ia32_vec_ext_v16qi:
-  case X86::BI__builtin_ia32_vec_ext_v8hi:
-  case X86::BI__builtin_ia32_vec_ext_v4si:
-  case X86::BI__builtin_ia32_vec_ext_v2di:
-  case X86::BI__builtin_ia32_vec_ext_v32qi:
-  case X86::BI__builtin_ia32_vec_ext_v16hi:
-  case X86::BI__builtin_ia32_vec_ext_v8si:
-  case X86::BI__builtin_ia32_vec_ext_v4di:
-  case X86::BI__builtin_ia32_vec_ext_v4sf:
-    return interp__builtin_vec_ext(S, OpPC, Call, BuiltinID);
-
-  case X86::BI__builtin_ia32_vec_set_v4hi:
-  case X86::BI__builtin_ia32_vec_set_v16qi:
-  case X86::BI__builtin_ia32_vec_set_v8hi:
-  case X86::BI__builtin_ia32_vec_set_v4si:
-  case X86::BI__builtin_ia32_vec_set_v2di:
-  case X86::BI__builtin_ia32_vec_set_v32qi:
-  case X86::BI__builtin_ia32_vec_set_v16hi:
-  case X86::BI__builtin_ia32_vec_set_v8si:
-  case X86::BI__builtin_ia32_vec_set_v4di:
-    return interp__builtin_vec_set(S, OpPC, Call, BuiltinID);
-
   default:
     S.FFDiag(S.Current->getLocation(OpPC),
              diag::note_invalid_subexpr_in_const_expr)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 25ded19554518..281f9a36093ec 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12089,9 +12089,9 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
     if (!SrcLen || !RetLen || (SrcLen % RetLen) != 0) return false;

     unsigned Lanes = SrcLen / RetLen;
-    unsigned Lane = static_cast(Imm.getZExtValue() % Lanes);
-    unsigned Base = Lane * RetLen;
-    uint64_t Mask = MaskImm.getZExtValue();
+    unsigned Lane = static_cast(Imm.getZExtValue() % Lanes);
+    unsigned Base = Lane * RetLen;
+    uint64_t Mask = MaskImm.getZExtValue();

     SmallVector ResultElements;
     ResultElements.reserve(RetLen);
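// ---------------------------------------------------------------------------
// Editorial aside, not part of the patch: a minimal sketch of the kind of
// constexpr usage the masked-extract handling above is meant to enable. It
// assumes the _mm512_set_epi64 / _mm256_set1_epi64x helpers and the masked
// extract wrapper are marked constexpr in <immintrin.h> and that AVX512F is
// enabled for the translation unit; names and values are illustrative only.
#include <immintrin.h>

constexpr __m256i extract_upper_lane_masked() {
  // Element i of Src holds the value i, so lane 1 of the 512-bit source is
  // {4, 5, 6, 7}.
  const __m512i Src = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0);
  const __m256i Merge = _mm256_set1_epi64x(-1);
  // Mask 0b0101: keep elements 0 and 2 of the extracted lane, take elements
  // 1 and 3 from Merge.
  return _mm512_mask_extracti64x4_epi64(Merge, 0x5, Src, 1);
}

constexpr __m256i R = extract_upper_lane_masked();
static_assert(R[0] == 4 && R[1] == -1 && R[2] == 6 && R[3] == -1,
              "masked 256-bit lane extract should fold at compile time");
// ---------------------------------------------------------------------------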