Skip to content

Commit 4b81d70

Browse files
SeongjaePLukacma
authored andcommitted
[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - allow AVX/AVX512 subvector extraction intrinsics to be used in constexpr llvm#157712 (llvm#162836)
**This PR supersedes and replaces PR llvm#158853** The original branch diverged too far from the main branch, resulting in significant merge conflicts that were difficult to resolve cleanly. To provide a clean and reviewable history, this new PR was created by cherry-picking the necessary commits onto a fresh branch based on the latest `main`. --- *(Original Description)* This patch enables the use of AVX/AVX512 subvector extraction intrinsics within `constexpr` functions. This is achieved by implementing the evaluation logic for these intrinsics in `VectorExprEvaluator::VisitCallExpr` and `InterpretBuiltin`. The original discussion and review comments can be found in the previous pull request for context: llvm#158853 Fixes llvm#157712
1 parent c5bf570 commit 4b81d70

File tree

13 files changed

+262
-64
lines changed

13 files changed

+262
-64
lines changed

clang/include/clang/Basic/BuiltinsX86.td

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -497,9 +497,6 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
497497
def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
498498
def cmppd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant char)">;
499499
def cmpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">;
500-
def vextractf128_pd256 : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int)">;
501-
def vextractf128_ps256 : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int)">;
502-
def vextractf128_si256 : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int)">;
503500
def cvtpd2ps256 : X86Builtin<"_Vector<4, float>(_Vector<4, double>)">;
504501
def cvtps2dq256 : X86Builtin<"_Vector<8, int>(_Vector<8, float>)">;
505502
def cvttpd2dq256 : X86Builtin<"_Vector<4, int>(_Vector<4, double>)">;
@@ -520,6 +517,9 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
520517
def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
521518
def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
522519
def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
520+
def vextractf128_pd256 : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int)">;
521+
def vextractf128_ps256 : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int)">;
522+
def vextractf128_si256 : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int)">;
523523
def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
524524
def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
525525
def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
@@ -622,7 +622,6 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
622622
def permvarsf256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">;
623623
def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
624624
def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
625-
def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">;
626625
}
627626

628627

@@ -690,6 +689,7 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
690689
def psrlv4si : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">;
691690
def psllv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
692691
def psrlv2di : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">;
692+
def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">;
693693
}
694694

695695
let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
@@ -1091,7 +1091,7 @@ let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256
10911091
def alignq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
10921092
}
10931093

1094-
let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
1094+
let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
10951095
def extractf64x4_mask : X86Builtin<"_Vector<4, double>(_Vector<8, double>, _Constant int, _Vector<4, double>, unsigned char)">;
10961096
def extractf32x4_mask : X86Builtin<"_Vector<4, float>(_Vector<16, float>, _Constant int, _Vector<4, float>, unsigned char)">;
10971097
}
@@ -2956,24 +2956,24 @@ let Features = "avx512vl", Attributes = [NoThrow, RequiredVectorWidth<256>] in {
29562956
def pmovqw256mem_mask : X86Builtin<"void(_Vector<8, short *>, _Vector<4, long long int>, unsigned char)">;
29572957
}
29582958

2959-
let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
2959+
let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
29602960
def extractf32x8_mask : X86Builtin<"_Vector<8, float>(_Vector<16, float>, _Constant int, _Vector<8, float>, unsigned char)">;
29612961
def extractf64x2_512_mask : X86Builtin<"_Vector<2, double>(_Vector<8, double>, _Constant int, _Vector<2, double>, unsigned char)">;
29622962
def extracti32x8_mask : X86Builtin<"_Vector<8, int>(_Vector<16, int>, _Constant int, _Vector<8, int>, unsigned char)">;
29632963
def extracti64x2_512_mask : X86Builtin<"_Vector<2, long long int>(_Vector<8, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">;
29642964
}
29652965

2966-
let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
2966+
let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
29672967
def extracti32x4_mask : X86Builtin<"_Vector<4, int>(_Vector<16, int>, _Constant int, _Vector<4, int>, unsigned char)">;
29682968
def extracti64x4_mask : X86Builtin<"_Vector<4, long long int>(_Vector<8, long long int>, _Constant int, _Vector<4, long long int>, unsigned char)">;
29692969
}
29702970

2971-
let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
2971+
let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
29722972
def extractf64x2_256_mask : X86Builtin<"_Vector<2, double>(_Vector<4, double>, _Constant int, _Vector<2, double>, unsigned char)">;
29732973
def extracti64x2_256_mask : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int, _Vector<2, long long int>, unsigned char)">;
29742974
}
29752975

2976-
let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
2976+
let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
29772977
def extractf32x4_256_mask : X86Builtin<"_Vector<4, float>(_Vector<8, float>, _Constant int, _Vector<4, float>, unsigned char)">;
29782978
def extracti32x4_256_mask : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int, _Vector<4, int>, unsigned char)">;
29792979
}

clang/lib/AST/ByteCode/InterpBuiltin.cpp

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2991,6 +2991,82 @@ static bool interp__builtin_elementwise_triop(
29912991
return true;
29922992
}
29932993

2994+
static bool interp__builtin_x86_extract_vector(InterpState &S, CodePtr OpPC,
2995+
const CallExpr *Call,
2996+
unsigned ID) {
2997+
assert(Call->getNumArgs() == 2);
2998+
2999+
APSInt ImmAPS = popToAPSInt(S, Call->getArg(1));
3000+
uint64_t Index = ImmAPS.getZExtValue();
3001+
3002+
const Pointer &Src = S.Stk.pop<Pointer>();
3003+
if (!Src.getFieldDesc()->isPrimitiveArray())
3004+
return false;
3005+
3006+
const Pointer &Dst = S.Stk.peek<Pointer>();
3007+
if (!Dst.getFieldDesc()->isPrimitiveArray())
3008+
return false;
3009+
3010+
unsigned SrcElems = Src.getNumElems();
3011+
unsigned DstElems = Dst.getNumElems();
3012+
3013+
unsigned NumLanes = SrcElems / DstElems;
3014+
unsigned Lane = static_cast<unsigned>(Index % NumLanes);
3015+
unsigned ExtractPos = Lane * DstElems;
3016+
3017+
PrimType ElemT = Src.getFieldDesc()->getPrimType();
3018+
3019+
TYPE_SWITCH(ElemT, {
3020+
for (unsigned I = 0; I != DstElems; ++I) {
3021+
Dst.elem<T>(I) = Src.elem<T>(ExtractPos + I);
3022+
}
3023+
});
3024+
3025+
Dst.initializeAllElements();
3026+
return true;
3027+
}
3028+
3029+
static bool interp__builtin_x86_extract_vector_masked(InterpState &S,
3030+
CodePtr OpPC,
3031+
const CallExpr *Call,
3032+
unsigned ID) {
3033+
assert(Call->getNumArgs() == 4);
3034+
3035+
APSInt MaskAPS = popToAPSInt(S, Call->getArg(3));
3036+
const Pointer &Merge = S.Stk.pop<Pointer>();
3037+
APSInt ImmAPS = popToAPSInt(S, Call->getArg(1));
3038+
const Pointer &Src = S.Stk.pop<Pointer>();
3039+
3040+
if (!Src.getFieldDesc()->isPrimitiveArray() ||
3041+
!Merge.getFieldDesc()->isPrimitiveArray())
3042+
return false;
3043+
3044+
const Pointer &Dst = S.Stk.peek<Pointer>();
3045+
if (!Dst.getFieldDesc()->isPrimitiveArray())
3046+
return false;
3047+
3048+
unsigned SrcElems = Src.getNumElems();
3049+
unsigned DstElems = Dst.getNumElems();
3050+
3051+
unsigned NumLanes = SrcElems / DstElems;
3052+
unsigned Lane = static_cast<unsigned>(ImmAPS.getZExtValue() % NumLanes);
3053+
unsigned Base = Lane * DstElems;
3054+
3055+
PrimType ElemT = Src.getFieldDesc()->getPrimType();
3056+
3057+
TYPE_SWITCH(ElemT, {
3058+
for (unsigned I = 0; I != DstElems; ++I) {
3059+
if (MaskAPS[I])
3060+
Dst.elem<T>(I) = Src.elem<T>(Base + I);
3061+
else
3062+
Dst.elem<T>(I) = Merge.elem<T>(I);
3063+
}
3064+
});
3065+
3066+
Dst.initializeAllElements();
3067+
return true;
3068+
}
3069+
29943070
static bool interp__builtin_x86_insert_subvector(InterpState &S, CodePtr OpPC,
29953071
const CallExpr *Call,
29963072
unsigned ID) {
@@ -3688,6 +3764,25 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
36883764
S, OpPC, Call, [](const APSInt &LHS, const APSInt &RHS) {
36893765
return LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS);
36903766
});
3767+
case X86::BI__builtin_ia32_extract128i256:
3768+
case X86::BI__builtin_ia32_vextractf128_pd256:
3769+
case X86::BI__builtin_ia32_vextractf128_ps256:
3770+
case X86::BI__builtin_ia32_vextractf128_si256:
3771+
return interp__builtin_x86_extract_vector(S, OpPC, Call, BuiltinID);
3772+
3773+
case X86::BI__builtin_ia32_extractf32x4_256_mask:
3774+
case X86::BI__builtin_ia32_extractf32x4_mask:
3775+
case X86::BI__builtin_ia32_extractf32x8_mask:
3776+
case X86::BI__builtin_ia32_extractf64x2_256_mask:
3777+
case X86::BI__builtin_ia32_extractf64x2_512_mask:
3778+
case X86::BI__builtin_ia32_extractf64x4_mask:
3779+
case X86::BI__builtin_ia32_extracti32x4_256_mask:
3780+
case X86::BI__builtin_ia32_extracti32x4_mask:
3781+
case X86::BI__builtin_ia32_extracti32x8_mask:
3782+
case X86::BI__builtin_ia32_extracti64x2_256_mask:
3783+
case X86::BI__builtin_ia32_extracti64x2_512_mask:
3784+
case X86::BI__builtin_ia32_extracti64x4_mask:
3785+
return interp__builtin_x86_extract_vector_masked(S, OpPC, Call, BuiltinID);
36913786

36923787
case clang::X86::BI__builtin_ia32_pmulhrsw128:
36933788
case clang::X86::BI__builtin_ia32_pmulhrsw256:

clang/lib/AST/ExprConstant.cpp

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11811,6 +11811,73 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
1181111811
return LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS);
1181211812
});
1181311813

11814+
case X86::BI__builtin_ia32_extract128i256:
11815+
case X86::BI__builtin_ia32_vextractf128_pd256:
11816+
case X86::BI__builtin_ia32_vextractf128_ps256:
11817+
case X86::BI__builtin_ia32_vextractf128_si256: {
11818+
APValue SourceVec, SourceImm;
11819+
if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) ||
11820+
!EvaluateAsRValue(Info, E->getArg(1), SourceImm))
11821+
return false;
11822+
11823+
if (!SourceVec.isVector())
11824+
return false;
11825+
11826+
const auto *RetVT = E->getType()->castAs<VectorType>();
11827+
unsigned RetLen = RetVT->getNumElements();
11828+
unsigned Idx = SourceImm.getInt().getZExtValue() & 1;
11829+
11830+
SmallVector<APValue, 32> ResultElements;
11831+
ResultElements.reserve(RetLen);
11832+
11833+
for (unsigned I = 0; I < RetLen; I++)
11834+
ResultElements.push_back(SourceVec.getVectorElt(Idx * RetLen + I));
11835+
11836+
return Success(APValue(ResultElements.data(), RetLen), E);
11837+
}
11838+
11839+
case X86::BI__builtin_ia32_extracti32x4_256_mask:
11840+
case X86::BI__builtin_ia32_extractf32x4_256_mask:
11841+
case X86::BI__builtin_ia32_extracti32x4_mask:
11842+
case X86::BI__builtin_ia32_extractf32x4_mask:
11843+
case X86::BI__builtin_ia32_extracti32x8_mask:
11844+
case X86::BI__builtin_ia32_extractf32x8_mask:
11845+
case X86::BI__builtin_ia32_extracti64x2_256_mask:
11846+
case X86::BI__builtin_ia32_extractf64x2_256_mask:
11847+
case X86::BI__builtin_ia32_extracti64x2_512_mask:
11848+
case X86::BI__builtin_ia32_extractf64x2_512_mask:
11849+
case X86::BI__builtin_ia32_extracti64x4_mask:
11850+
case X86::BI__builtin_ia32_extractf64x4_mask: {
11851+
APValue SourceVec, MergeVec;
11852+
APSInt Imm, MaskImm;
11853+
11854+
if (!EvaluateAsRValue(Info, E->getArg(0), SourceVec) ||
11855+
!EvaluateInteger(E->getArg(1), Imm, Info) ||
11856+
!EvaluateAsRValue(Info, E->getArg(2), MergeVec) ||
11857+
!EvaluateInteger(E->getArg(3), MaskImm, Info))
11858+
return false;
11859+
11860+
const auto *RetVT = E->getType()->castAs<VectorType>();
11861+
unsigned RetLen = RetVT->getNumElements();
11862+
11863+
if (!SourceVec.isVector() || !MergeVec.isVector())
11864+
return false;
11865+
unsigned SrcLen = SourceVec.getVectorLength();
11866+
unsigned Lanes = SrcLen / RetLen;
11867+
unsigned Lane = static_cast<unsigned>(Imm.getZExtValue() % Lanes);
11868+
unsigned Base = Lane * RetLen;
11869+
11870+
SmallVector<APValue, 32> ResultElements;
11871+
ResultElements.reserve(RetLen);
11872+
for (unsigned I = 0; I < RetLen; ++I) {
11873+
if (MaskImm[I])
11874+
ResultElements.push_back(SourceVec.getVectorElt(Base + I));
11875+
else
11876+
ResultElements.push_back(MergeVec.getVectorElt(I));
11877+
}
11878+
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
11879+
}
11880+
1181411881
case clang::X86::BI__builtin_ia32_pavgb128:
1181511882
case clang::X86::BI__builtin_ia32_pavgw128:
1181611883
case clang::X86::BI__builtin_ia32_pavgb256:

clang/lib/Headers/avx512dqintrin.h

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1200,10 +1200,10 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
12001200
(__v8di)_mm512_setzero_si512());
12011201
}
12021202

1203-
#define _mm512_extractf32x8_ps(A, imm) \
1204-
((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
1205-
(__v8sf)_mm256_undefined_ps(), \
1206-
(__mmask8)-1))
1203+
#define _mm512_extractf32x8_ps(A, imm) \
1204+
((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
1205+
(__v8sf)_mm256_setzero_ps(), \
1206+
(__mmask8) - 1))
12071207

12081208
#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \
12091209
((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \
@@ -1215,11 +1215,10 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
12151215
(__v8sf)_mm256_setzero_ps(), \
12161216
(__mmask8)(U)))
12171217

1218-
#define _mm512_extractf64x2_pd(A, imm) \
1219-
((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
1220-
(int)(imm), \
1221-
(__v2df)_mm_undefined_pd(), \
1222-
(__mmask8)-1))
1218+
#define _mm512_extractf64x2_pd(A, imm) \
1219+
((__m128d)__builtin_ia32_extractf64x2_512_mask( \
1220+
(__v8df)(__m512d)(A), (int)(imm), (__v2df)_mm_setzero_pd(), \
1221+
(__mmask8) - 1))
12231222

12241223
#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \
12251224
((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \
@@ -1233,10 +1232,10 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
12331232
(__v2df)_mm_setzero_pd(), \
12341233
(__mmask8)(U)))
12351234

1236-
#define _mm512_extracti32x8_epi32(A, imm) \
1237-
((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
1238-
(__v8si)_mm256_undefined_si256(), \
1239-
(__mmask8)-1))
1235+
#define _mm512_extracti32x8_epi32(A, imm) \
1236+
((__m256i)__builtin_ia32_extracti32x8_mask( \
1237+
(__v16si)(__m512i)(A), (int)(imm), (__v8si)_mm256_setzero_si256(), \
1238+
(__mmask8) - 1))
12401239

12411240
#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \
12421241
((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \
@@ -1248,11 +1247,10 @@ _mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) {
12481247
(__v8si)_mm256_setzero_si256(), \
12491248
(__mmask8)(U)))
12501249

1251-
#define _mm512_extracti64x2_epi64(A, imm) \
1252-
((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \
1253-
(int)(imm), \
1254-
(__v2di)_mm_undefined_si128(), \
1255-
(__mmask8)-1))
1250+
#define _mm512_extracti64x2_epi64(A, imm) \
1251+
((__m128i)__builtin_ia32_extracti64x2_512_mask( \
1252+
(__v8di)(__m512i)(A), (int)(imm), (__v2di)_mm_setzero_si128(), \
1253+
(__mmask8) - 1))
12561254

12571255
#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \
12581256
((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \

clang/lib/Headers/avx512fintrin.h

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3156,10 +3156,10 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
31563156
(__v16si)_mm512_setzero_si512()))
31573157
/* Vector Extract */
31583158

3159-
#define _mm512_extractf64x4_pd(A, I) \
3160-
((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
3161-
(__v4df)_mm256_undefined_pd(), \
3162-
(__mmask8)-1))
3159+
#define _mm512_extractf64x4_pd(A, I) \
3160+
((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \
3161+
(__v4df)_mm256_setzero_pd(), \
3162+
(__mmask8) - 1))
31633163

31643164
#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \
31653165
((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \
@@ -3171,10 +3171,10 @@ _mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I,
31713171
(__v4df)_mm256_setzero_pd(), \
31723172
(__mmask8)(U)))
31733173

3174-
#define _mm512_extractf32x4_ps(A, I) \
3175-
((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
3176-
(__v4sf)_mm_undefined_ps(), \
3177-
(__mmask8)-1))
3174+
#define _mm512_extractf32x4_ps(A, I) \
3175+
((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \
3176+
(__v4sf)_mm_setzero_ps(), \
3177+
(__mmask8) - 1))
31783178

31793179
#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \
31803180
((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \
@@ -7089,10 +7089,10 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
70897089
__builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M);
70907090
}
70917091

7092-
#define _mm512_extracti32x4_epi32(A, imm) \
7093-
((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
7094-
(__v4si)_mm_undefined_si128(), \
7095-
(__mmask8)-1))
7092+
#define _mm512_extracti32x4_epi32(A, imm) \
7093+
((__m128i)__builtin_ia32_extracti32x4_mask( \
7094+
(__v16si)(__m512i)(A), (int)(imm), (__v4si)_mm_setzero_si128(), \
7095+
(__mmask8) - 1))
70967096

70977097
#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \
70987098
((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \
@@ -7104,10 +7104,10 @@ _mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
71047104
(__v4si)_mm_setzero_si128(), \
71057105
(__mmask8)(U)))
71067106

7107-
#define _mm512_extracti64x4_epi64(A, imm) \
7107+
#define _mm512_extracti64x4_epi64(A, imm) \
71087108
((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \
7109-
(__v4di)_mm256_undefined_si256(), \
7110-
(__mmask8)-1))
7109+
(__v4di)_mm256_setzero_si256(), \
7110+
(__mmask8) - 1))
71117111

71127112
#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \
71137113
((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \

0 commit comments

Comments
 (0)