Skip to content
21 changes: 12 additions & 9 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -497,14 +497,14 @@ let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWid
def blendps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">;
def blendvpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Vector<4, double>)">;
def blendvps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Vector<8, float>)">;
def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
}

let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def vpermilpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Constant int)">;
def vpermilps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
def vinsertf128_pd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
def vinsertf128_ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
def vinsertf128_si256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
def sqrtpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>)">;
def sqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
def rsqrtps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>)">;
Expand All @@ -513,6 +513,7 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
def roundps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Constant int)">;
}


let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vtestzpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
def vtestcpd : X86Builtin<"int(_Vector<2, double>, _Vector<2, double>)">;
Expand Down Expand Up @@ -609,9 +610,9 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def permti256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>, _Constant int)">;
def permdi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Constant int)">;
def extract128i256 : X86Builtin<"_Vector<2, long long int>(_Vector<4, long long int>, _Constant int)">;
def insert128i256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
}


let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def pavgb256 : X86Builtin<"_Vector<32, unsigned char>(_Vector<32, unsigned char>, _Vector<32, unsigned char>)">;
def pavgw256 : X86Builtin<"_Vector<16, unsigned short>(_Vector<16, unsigned short>, _Vector<16, unsigned short>)">;
Expand Down Expand Up @@ -644,6 +645,8 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi
def psrlv8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def psllv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;
def psrlv4di : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<4, long long int>)">;

def insert128i256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
}

let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
Expand Down Expand Up @@ -2945,29 +2948,29 @@ let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256
def extracti32x4_256_mask : X86Builtin<"_Vector<4, int>(_Vector<8, int>, _Constant int, _Vector<4, int>, unsigned char)">;
}

let Features = "avx512dq", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
let Features = "avx512dq", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def insertf32x8 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<8, float>, _Constant int)">;
def insertf64x2_512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<2, double>, _Constant int)">;
def inserti32x8 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<8, int>, _Constant int)">;
def inserti64x2_512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<2, long long int>, _Constant int)">;
}

let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def insertf64x4 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<4, double>, _Constant int)">;
def inserti64x4 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<4, long long int>, _Constant int)">;
}

let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
let Features = "avx512dq,avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def insertf64x2_256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<2, double>, _Constant int)">;
def inserti64x2_256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>, _Constant int)">;
}

let Features = "avx512vl", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
let Features = "avx512vl", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def insertf32x4_256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<4, float>, _Constant int)">;
def inserti32x4_256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>, _Constant int)">;
}

let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def insertf32x4 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<4, float>, _Constant int)">;
def inserti32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<4, int>, _Constant int)">;
}
Expand Down
59 changes: 59 additions & 0 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2914,6 +2914,47 @@ static bool interp__builtin_elementwise_triop(
return true;
}

static bool interp__builtin_x86_insert_subvector(InterpState &S, CodePtr OpPC,
const CallExpr *Call,
unsigned ID) {
assert(Call->getNumArgs() == 3);

APSInt ImmAPS = popToAPSInt(S, Call->getArg(2));
uint64_t Index = ImmAPS.getZExtValue();

const Pointer &SubVec = S.Stk.pop<Pointer>();
if (!SubVec.getFieldDesc()->isPrimitiveArray())
return false;

const Pointer &BaseVec = S.Stk.pop<Pointer>();
if (!BaseVec.getFieldDesc()->isPrimitiveArray())
return false;

const Pointer &Dst = S.Stk.peek<Pointer>();

unsigned BaseElements = BaseVec.getNumElems();
unsigned SubElements = SubVec.getNumElems();

assert(SubElements != 0 && BaseElements != 0 && (BaseElements % SubElements) == 0);

unsigned NumLanes = BaseElements / SubElements;
unsigned Lane = static_cast<unsigned>(Index % NumLanes);
unsigned InsertPos = Lane * SubElements;

PrimType ElemPT = BaseVec.getFieldDesc()->getPrimType();

TYPE_SWITCH(ElemPT, {
for (unsigned I = 0; I != BaseElements; ++I)
Dst.elem<T>(I) = BaseVec.elem<T>(I);
for (unsigned I = 0; I != SubElements; ++I)
Dst.elem<T>(InsertPos + I) = SubVec.elem<T>(I);
});

Dst.initializeAllElements();

return true;
}

bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
uint32_t BuiltinID) {
if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
Expand Down Expand Up @@ -3572,6 +3613,24 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return interp__builtin_elementwise_triop(S, OpPC, Call,
llvm::APIntOps::fshr);

case X86::BI__builtin_ia32_insertf32x4_256:
case X86::BI__builtin_ia32_inserti32x4_256:
case X86::BI__builtin_ia32_insertf64x2_256:
case X86::BI__builtin_ia32_inserti64x2_256:
case X86::BI__builtin_ia32_insertf32x4:
case X86::BI__builtin_ia32_inserti32x4:
case X86::BI__builtin_ia32_insertf64x2_512:
case X86::BI__builtin_ia32_inserti64x2_512:
case X86::BI__builtin_ia32_insertf32x8:
case X86::BI__builtin_ia32_inserti32x8:
case X86::BI__builtin_ia32_insertf64x4:
case X86::BI__builtin_ia32_inserti64x4:
case X86::BI__builtin_ia32_vinsertf128_ps256:
case X86::BI__builtin_ia32_vinsertf128_pd256:
case X86::BI__builtin_ia32_vinsertf128_si256:
case X86::BI__builtin_ia32_insert128i256:
return interp__builtin_x86_insert_subvector(S, OpPC, Call, BuiltinID);

default:
S.FFDiag(S.Current->getLocation(OpPC),
diag::note_invalid_subexpr_in_const_expr)
Expand Down
45 changes: 45 additions & 0 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12128,6 +12128,51 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {

return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}

case X86::BI__builtin_ia32_insertf32x4_256:
case X86::BI__builtin_ia32_inserti32x4_256:
case X86::BI__builtin_ia32_insertf64x2_256:
case X86::BI__builtin_ia32_inserti64x2_256:
case X86::BI__builtin_ia32_insertf32x4:
case X86::BI__builtin_ia32_inserti32x4:
case X86::BI__builtin_ia32_insertf64x2_512:
case X86::BI__builtin_ia32_inserti64x2_512:
case X86::BI__builtin_ia32_insertf32x8:
case X86::BI__builtin_ia32_inserti32x8:
case X86::BI__builtin_ia32_insertf64x4:
case X86::BI__builtin_ia32_inserti64x4:
case X86::BI__builtin_ia32_vinsertf128_ps256:
case X86::BI__builtin_ia32_vinsertf128_pd256:
case X86::BI__builtin_ia32_vinsertf128_si256:
case X86::BI__builtin_ia32_insert128i256: {
APValue SourceDst, SourceSub;
if (!EvaluateAsRValue(Info, E->getArg(0), SourceDst) ||
!EvaluateAsRValue(Info, E->getArg(1), SourceSub))
return false;

APSInt Imm;
if (!EvaluateInteger(E->getArg(2), Imm, Info))
return false;

assert(SourceDst.isVector() && SourceSub.isVector());
unsigned DstLen = SourceDst.getVectorLength();
unsigned SubLen = SourceSub.getVectorLength();
assert(SubLen != 0 && DstLen != 0 && (DstLen % SubLen) == 0);
unsigned NumLanes = DstLen / SubLen;
unsigned LaneIdx = (Imm.getZExtValue() % NumLanes) * SubLen;

SmallVector<APValue, 16> ResultElements;
ResultElements.reserve(DstLen);

for (unsigned EltNum = 0; EltNum < DstLen; ++EltNum) {
if (EltNum >= LaneIdx && EltNum < LaneIdx + SubLen)
ResultElements.push_back(SourceSub.getVectorElt(EltNum - LaneIdx));
else
ResultElements.push_back(SourceDst.getVectorElt(EltNum));
}

return Success(APValue(ResultElements.data(), ResultElements.size()), E);
}
}
}

Expand Down
3 changes: 3 additions & 0 deletions clang/test/CodeGen/X86/avx-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -1144,20 +1144,23 @@ __m256d test_mm256_insertf128_pd(__m256d A, __m128d B) {
// CHECK: shufflevector <4 x double> %{{.*}}, <4 x double> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
return _mm256_insertf128_pd(A, B, 0);
}
TEST_CONSTEXPR(match_m256d(_mm256_insertf128_pd(((__m256d){1.0, 2.0, 3.0, 4.0}), ((__m128d){5.0, 6.0}), 0), 5.0, 6.0, 3.0, 4.0));

__m256 test_mm256_insertf128_ps(__m256 A, __m128 B) {
// CHECK-LABEL: test_mm256_insertf128_ps
// CHECK: shufflevector <4 x float> %{{.*}}, <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
return _mm256_insertf128_ps(A, B, 1);
}
TEST_CONSTEXPR(match_m256(_mm256_insertf128_ps(((__m256){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m128){10.0f, 20.0f, 30.0f, 40.0f}), 1), 1.0f, 2.0f, 3.0f, 4.0f, 10.0f, 20.0f, 30.0f, 40.0f));

__m256i test_mm256_insertf128_si256(__m256i A, __m128i B) {
// CHECK-LABEL: test_mm256_insertf128_si256
// CHECK: shufflevector <4 x i32> %{{.*}}, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
return _mm256_insertf128_si256(A, B, 0);
}
TEST_CONSTEXPR(match_m256i(_mm256_insertf128_si256(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), ((__m128i){10ULL, 20ULL}), 0), 10ULL, 20ULL, 3ULL, 4ULL));

__m256i test_mm256_lddqu_si256(__m256i* A) {
// CHECK-LABEL: test_mm256_lddqu_si256
Expand Down
1 change: 1 addition & 0 deletions clang/test/CodeGen/X86/avx2-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -779,6 +779,7 @@ __m256i test0_mm256_inserti128_si256(__m256i a, __m128i b) {
// CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 4, i32 5, i32 2, i32 3>
return _mm256_inserti128_si256(a, b, 0);
}
TEST_CONSTEXPR(match_m256i(_mm256_inserti128_si256(((__m256i){1ULL, 2ULL, 3ULL, 4ULL}), ((__m128i){10ULL, 20ULL}), 0), 10ULL, 20ULL, 3ULL, 4ULL));

__m256i test1_mm256_inserti128_si256(__m256i a, __m128i b) {
// CHECK-LABEL: test1_mm256_inserti128_si256
Expand Down
Loading
Loading