Skip to content

Commit 0ade346

Browse files
authored
[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - allow AVX/AVX512 subvector insertion intrinsics to be used in constexpr #157709 (#158778)
AVX/AVX512 vector insert intrinsics now support constexpr evaluation in both the AST evaluator and bytecode interpreter paths. FIXES: #157709
1 parent b864909 commit 0ade346

File tree

9 files changed

+158
-9
lines changed

9 files changed

+158
-9
lines changed

clang/include/clang/Basic/BuiltinsX86.td

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -497,14 +497,14 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
497497
def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
498498
def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
499499
def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
500+
def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
501+
def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
502+
def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
500503
}
501504

502505
let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
503506
def vpermilpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
504507
def vpermilps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
505-
def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
506-
def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
507-
def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
508508
def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
509509
def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
510510
def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
@@ -513,6 +513,7 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
513513
def roundps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
514514
}
515515

516+
516517
let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
517518
def vtestzpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
518519
def vtestcpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
@@ -609,9 +610,9 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
609610
def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
610611
def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
611612
def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">;
612-
def insert128i256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
613613
}
614614

615+
615616
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
616617
def pavgb256 : X86Builtin<"_Vector<32, unsigned char>(_Vector<32, unsigned char>, _Vector<32, unsigned char>)">;
617618
def pavgw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
@@ -644,6 +645,8 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
644645
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
645646
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
646647
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
648+
649+
def insert128i256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
647650
}
648651

649652
let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
@@ -2945,29 +2948,29 @@ let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256
29452948
def extracti32x4_256_mask : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int, _Vector<4, int>, unsigned char)">;
29462949
}
29472950

2948-
let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
2951+
let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
29492952
def insertf32x8 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<8, float>, _Constant int)">;
29502953
def insertf64x2_512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<2, double>, _Constant int)">;
29512954
def inserti32x8 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<8, int>, _Constant int)">;
29522955
def inserti64x2_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>, _Constant int)">;
29532956
}
29542957

2955-
let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
2958+
let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
29562959
def insertf64x4 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<4, double>, _Constant int)">;
29572960
def inserti64x4 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<4, long long int>, _Constant int)">;
29582961
}
29592962

2960-
let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
2963+
let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
29612964
def insertf64x2_256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
29622965
def inserti64x2_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
29632966
}
29642967

2965-
let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
2968+
let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
29662969
def insertf32x4_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
29672970
def inserti32x4_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
29682971
}
29692972

2970-
let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
2973+
let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
29712974
def insertf32x4 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<4, float>, _Constant int)">;
29722975
def inserti32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>, _Constant int)">;
29732976
}

clang/lib/AST/ByteCode/InterpBuiltin.cpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2914,6 +2914,48 @@ static bool interp__builtin_elementwise_triop(
29142914
return true;
29152915
}
29162916

2917+
static bool interp__builtin_x86_insert_subvector(InterpState &S, CodePtr OpPC,
2918+
const CallExpr *Call,
2919+
unsigned ID) {
2920+
assert(Call->getNumArgs() == 3);
2921+
2922+
APSInt ImmAPS = popToAPSInt(S, Call->getArg(2));
2923+
uint64_t Index = ImmAPS.getZExtValue();
2924+
2925+
const Pointer &SubVec = S.Stk.pop<Pointer>();
2926+
if (!SubVec.getFieldDesc()->isPrimitiveArray())
2927+
return false;
2928+
2929+
const Pointer &BaseVec = S.Stk.pop<Pointer>();
2930+
if (!BaseVec.getFieldDesc()->isPrimitiveArray())
2931+
return false;
2932+
2933+
const Pointer &Dst = S.Stk.peek<Pointer>();
2934+
2935+
unsigned BaseElements = BaseVec.getNumElems();
2936+
unsigned SubElements = SubVec.getNumElems();
2937+
2938+
assert(SubElements != 0 && BaseElements != 0 &&
2939+
(BaseElements % SubElements) == 0);
2940+
2941+
unsigned NumLanes = BaseElements / SubElements;
2942+
unsigned Lane = static_cast<unsigned>(Index % NumLanes);
2943+
unsigned InsertPos = Lane * SubElements;
2944+
2945+
PrimType ElemPT = BaseVec.getFieldDesc()->getPrimType();
2946+
2947+
TYPE_SWITCH(ElemPT, {
2948+
for (unsigned I = 0; I != BaseElements; ++I)
2949+
Dst.elem<T>(I) = BaseVec.elem<T>(I);
2950+
for (unsigned I = 0; I != SubElements; ++I)
2951+
Dst.elem<T>(InsertPos + I) = SubVec.elem<T>(I);
2952+
});
2953+
2954+
Dst.initializeAllElements();
2955+
2956+
return true;
2957+
}
2958+
29172959
bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
29182960
uint32_t BuiltinID) {
29192961
if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
@@ -3572,6 +3614,24 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
35723614
return interp__builtin_elementwise_triop(S, OpPC, Call,
35733615
llvm::APIntOps::fshr);
35743616

3617+
case X86::BI__builtin_ia32_insertf32x4_256:
3618+
case X86::BI__builtin_ia32_inserti32x4_256:
3619+
case X86::BI__builtin_ia32_insertf64x2_256:
3620+
case X86::BI__builtin_ia32_inserti64x2_256:
3621+
case X86::BI__builtin_ia32_insertf32x4:
3622+
case X86::BI__builtin_ia32_inserti32x4:
3623+
case X86::BI__builtin_ia32_insertf64x2_512:
3624+
case X86::BI__builtin_ia32_inserti64x2_512:
3625+
case X86::BI__builtin_ia32_insertf32x8:
3626+
case X86::BI__builtin_ia32_inserti32x8:
3627+
case X86::BI__builtin_ia32_insertf64x4:
3628+
case X86::BI__builtin_ia32_inserti64x4:
3629+
case X86::BI__builtin_ia32_vinsertf128_ps256:
3630+
case X86::BI__builtin_ia32_vinsertf128_pd256:
3631+
case X86::BI__builtin_ia32_vinsertf128_si256:
3632+
case X86::BI__builtin_ia32_insert128i256:
3633+
return interp__builtin_x86_insert_subvector(S, OpPC, Call, BuiltinID);
3634+
35753635
default:
35763636
S.FFDiag(S.Current->getLocation(OpPC),
35773637
diag::note_invalid_subexpr_in_const_expr)

clang/lib/AST/ExprConstant.cpp

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12128,6 +12128,51 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
1212812128

1212912129
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
1213012130
}
12131+
12132+
case X86::BI__builtin_ia32_insertf32x4_256:
12133+
case X86::BI__builtin_ia32_inserti32x4_256:
12134+
case X86::BI__builtin_ia32_insertf64x2_256:
12135+
case X86::BI__builtin_ia32_inserti64x2_256:
12136+
case X86::BI__builtin_ia32_insertf32x4:
12137+
case X86::BI__builtin_ia32_inserti32x4:
12138+
case X86::BI__builtin_ia32_insertf64x2_512:
12139+
case X86::BI__builtin_ia32_inserti64x2_512:
12140+
case X86::BI__builtin_ia32_insertf32x8:
12141+
case X86::BI__builtin_ia32_inserti32x8:
12142+
case X86::BI__builtin_ia32_insertf64x4:
12143+
case X86::BI__builtin_ia32_inserti64x4:
12144+
case X86::BI__builtin_ia32_vinsertf128_ps256:
12145+
case X86::BI__builtin_ia32_vinsertf128_pd256:
12146+
case X86::BI__builtin_ia32_vinsertf128_si256:
12147+
case X86::BI__builtin_ia32_insert128i256: {
12148+
APValue SourceDst, SourceSub;
12149+
if (!EvaluateAsRValue(Info, E->getArg(0), SourceDst) ||
12150+
!EvaluateAsRValue(Info, E->getArg(1), SourceSub))
12151+
return false;
12152+
12153+
APSInt Imm;
12154+
if (!EvaluateInteger(E->getArg(2), Imm, Info))
12155+
return false;
12156+
12157+
assert(SourceDst.isVector() && SourceSub.isVector());
12158+
unsigned DstLen = SourceDst.getVectorLength();
12159+
unsigned SubLen = SourceSub.getVectorLength();
12160+
assert(SubLen != 0 && DstLen != 0 && (DstLen % SubLen) == 0);
12161+
unsigned NumLanes = DstLen / SubLen;
12162+
unsigned LaneIdx = (Imm.getZExtValue() % NumLanes) * SubLen;
12163+
12164+
SmallVector<APValue, 16> ResultElements;
12165+
ResultElements.reserve(DstLen);
12166+
12167+
for (unsigned EltNum = 0; EltNum < DstLen; ++EltNum) {
12168+
if (EltNum >= LaneIdx && EltNum < LaneIdx + SubLen)
12169+
ResultElements.push_back(SourceSub.getVectorElt(EltNum - LaneIdx));
12170+
else
12171+
ResultElements.push_back(SourceDst.getVectorElt(EltNum));
12172+
}
12173+
12174+
return Success(APValue(ResultElements.data(), ResultElements.size()), E);
12175+
}
1213112176
}
1213212177
}
1213312178

clang/test/CodeGen/X86/avx-builtins.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1144,20 +1144,23 @@ __m256d test_mm256_insertf128_pd(__m256d A, __m128d B) {
11441144
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
11451145
return _mm256_insertf128_pd(A, B, 0);
11461146
}
1147+
TEST_CONSTEXPR(match_m256d(_mm256_insertf128_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m128d){5.0, 6.0}), 0), 5.0, 6.0, 3.0, 4.0));
11471148

11481149
__m256 test_mm256_insertf128_ps(__m256 A, __m128 B) {
11491150
// CHECK-LABEL: test_mm256_insertf128_ps
11501151
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
11511152
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
11521153
return _mm256_insertf128_ps(A, B, 1);
11531154
}
1155+
TEST_CONSTEXPR(match_m256(_mm256_insertf128_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m128){10.0f, 20.0f, 30.0f, 40.0f}), 1), 1.0f, 2.0f, 3.0f, 4.0f, 10.0f, 20.0f, 30.0f, 40.0f));
11541156

11551157
__m256i test_mm256_insertf128_si256(__m256i A, __m128i B) {
11561158
// CHECK-LABEL: test_mm256_insertf128_si256
11571159
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
11581160
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
11591161
return _mm256_insertf128_si256(A, B, 0);
11601162
}
1163+
TEST_CONSTEXPR(match_m256i(_mm256_insertf128_si256(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), ((__m128i){10ULL, 20ULL}), 0), 10ULL, 20ULL, 3ULL, 4ULL));
11611164

11621165
__m256i test_mm256_lddqu_si256(__m256i* A) {
11631166
// CHECK-LABEL: test_mm256_lddqu_si256

clang/test/CodeGen/X86/avx2-builtins.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -779,6 +779,7 @@ __m256i test0_mm256_inserti128_si256(__m256i a, __m128i b) {
779779
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
780780
return _mm256_inserti128_si256(a, b, 0);
781781
}
782+
TEST_CONSTEXPR(match_m256i(_mm256_inserti128_si256(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), ((__m128i){10ULL, 20ULL}), 0), 10ULL, 20ULL, 3ULL, 4ULL));
782783

783784
__m256i test1_mm256_inserti128_si256(__m256i a, __m128i b) {
784785
// CHECK-LABEL: test1_mm256_inserti128_si256

0 commit comments

Comments
 (0)