Skip to content

Commit cf85cf4

Browse files
[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - allow insertps intrinsic to be used in constexpr (#165513)
Resolves #165161
1 parent 2504f5f commit cf85cf4

File tree

4 files changed

+88
-13
lines changed

4 files changed

+88
-13
lines changed

clang/include/clang/Basic/BuiltinsX86.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,6 @@ let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorW
328328
}
329329

330330
let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
331-
def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant char)">;
332331
def roundps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Constant int)">;
333332
def roundss : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">;
334333
def roundsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">;
@@ -342,6 +341,8 @@ let Features = "sse4.1", Attributes = [NoThrow, Const, RequiredVectorWidth<128>]
342341

343342
let Features = "sse4.1",
344343
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
344+
def insertps128 : X86Builtin<"_Vector<4, float>(_Vector<4, float>, "
345+
"_Vector<4, float>, _Constant char)">;
345346
def ptestz128
346347
: X86Builtin<"int(_Vector<2, long long int>, _Vector<2, long long int>)">;
347348
def ptestc128

clang/lib/AST/ByteCode/InterpBuiltin.cpp

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3411,7 +3411,7 @@ static bool interp__builtin_x86_byteshift(
34113411

34123412
static bool interp__builtin_ia32_shuffle_generic(
34133413
InterpState &S, CodePtr OpPC, const CallExpr *Call,
3414-
llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
3414+
llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned)>
34153415
GetSourceIndex) {
34163416

34173417
assert(Call->getNumArgs() == 3);
@@ -3428,8 +3428,19 @@ static bool interp__builtin_ia32_shuffle_generic(
34283428

34293429
for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) {
34303430
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
3431-
const Pointer &Src = (SrcVecIdx == 0) ? A : B;
3432-
TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });
3431+
3432+
if (SrcIdx < 0) {
3433+
// Zero out this element
3434+
if (ElemT == PT_Float) {
3435+
Dst.elem<Floating>(DstIdx) = Floating(
3436+
S.getASTContext().getFloatTypeSemantics(VecT->getElementType()));
3437+
} else {
3438+
INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem<T>(DstIdx) = T::from(0); });
3439+
}
3440+
} else {
3441+
const Pointer &Src = (SrcVecIdx == 0) ? A : B;
3442+
TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); });
3443+
}
34333444
}
34343445
Dst.initializeAllElements();
34353446

@@ -4382,7 +4393,8 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
43824393
unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0;
43834394
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
43844395
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
4385-
return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
4396+
return std::pair<unsigned, int>{SrcIdx,
4397+
static_cast<int>(LaneOffset + Index)};
43864398
});
43874399
case X86::BI__builtin_ia32_shufpd:
43884400
case X86::BI__builtin_ia32_shufpd256:
@@ -4400,7 +4412,27 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
44004412
unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0;
44014413
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
44024414
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
4403-
return std::pair<unsigned, unsigned>{SrcIdx, LaneOffset + Index};
4415+
return std::pair<unsigned, int>{SrcIdx,
4416+
static_cast<int>(LaneOffset + Index)};
4417+
});
4418+
case X86::BI__builtin_ia32_insertps128:
4419+
return interp__builtin_ia32_shuffle_generic(
4420+
S, OpPC, Call, [](unsigned DstIdx, unsigned Mask) {
4421+
// Bits [3:0]: zero mask - if bit is set, zero this element
4422+
if ((Mask & (1 << DstIdx)) != 0) {
4423+
return std::pair<unsigned, int>{0, -1};
4424+
}
4425+
// Bits [7:6]: select element from source vector Y (0-3)
4426+
// Bits [5:4]: select destination position (0-3)
4427+
unsigned SrcElem = (Mask >> 6) & 0x3;
4428+
unsigned DstElem = (Mask >> 4) & 0x3;
4429+
if (DstIdx == DstElem) {
4430+
// Insert element from source vector (B) at this position
4431+
return std::pair<unsigned, int>{1, static_cast<int>(SrcElem)};
4432+
} else {
4433+
// Copy from destination vector (A)
4434+
return std::pair<unsigned, int>{0, static_cast<int>(DstIdx)};
4435+
}
44044436
});
44054437
case X86::BI__builtin_ia32_pshufb128:
44064438
case X86::BI__builtin_ia32_pshufb256:

clang/lib/AST/ExprConstant.cpp

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11621,7 +11621,7 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result,
1162111621

1162211622
static bool evalShuffleGeneric(
1162311623
EvalInfo &Info, const CallExpr *Call, APValue &Out,
11624-
llvm::function_ref<std::pair<unsigned, unsigned>(unsigned, unsigned)>
11624+
llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned)>
1162511625
GetSourceIndex) {
1162611626

1162711627
const auto *VT = Call->getType()->getAs<VectorType>();
@@ -11644,8 +11644,16 @@ static bool evalShuffleGeneric(
1164411644

1164511645
for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) {
1164611646
auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask);
11647-
const APValue &Src = (SrcVecIdx == 0) ? A : B;
11648-
ResultElements.push_back(Src.getVectorElt(SrcIdx));
11647+
11648+
if (SrcIdx < 0) {
11649+
// Zero out this element
11650+
QualType ElemTy = VT->getElementType();
11651+
ResultElements.push_back(
11652+
APValue(APFloat::getZero(Info.Ctx.getFloatTypeSemantics(ElemTy))));
11653+
} else {
11654+
const APValue &Src = (SrcVecIdx == 0) ? A : B;
11655+
ResultElements.push_back(Src.getVectorElt(SrcIdx));
11656+
}
1164911657
}
1165011658

1165111659
Out = APValue(ResultElements.data(), ResultElements.size());
@@ -12438,7 +12446,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
1243812446
if (!evalShuffleGeneric(
1243912447
Info, E, R,
1244012448
[](unsigned DstIdx,
12441-
unsigned ShuffleMask) -> std::pair<unsigned, unsigned> {
12449+
unsigned ShuffleMask) -> std::pair<unsigned, int> {
1244212450
constexpr unsigned LaneBits = 128u;
1244312451
unsigned NumElemPerLane = LaneBits / 32;
1244412452
unsigned NumSelectableElems = NumElemPerLane / 2;
@@ -12451,7 +12459,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
1245112459
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
1245212460
unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 0 : 1;
1245312461
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
12454-
return {SrcIdx, LaneOffset + Index};
12462+
return {SrcIdx, static_cast<int>(LaneOffset + Index)};
1245512463
}))
1245612464
return false;
1245712465
return Success(R, E);
@@ -12463,7 +12471,7 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
1246312471
if (!evalShuffleGeneric(
1246412472
Info, E, R,
1246512473
[](unsigned DstIdx,
12466-
unsigned ShuffleMask) -> std::pair<unsigned, unsigned> {
12474+
unsigned ShuffleMask) -> std::pair<unsigned, int> {
1246712475
constexpr unsigned LaneBits = 128u;
1246812476
unsigned NumElemPerLane = LaneBits / 64;
1246912477
unsigned NumSelectableElems = NumElemPerLane / 2;
@@ -12476,7 +12484,31 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
1247612484
unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits;
1247712485
unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 0 : 1;
1247812486
unsigned Index = (ShuffleMask >> BitIndex) & IndexMask;
12479-
return {SrcIdx, LaneOffset + Index};
12487+
return {SrcIdx, static_cast<int>(LaneOffset + Index)};
12488+
}))
12489+
return false;
12490+
return Success(R, E);
12491+
}
12492+
case X86::BI__builtin_ia32_insertps128: {
12493+
APValue R;
12494+
if (!evalShuffleGeneric(
12495+
Info, E, R,
12496+
[](unsigned DstIdx, unsigned Mask) -> std::pair<unsigned, int> {
12497+
// Bits [3:0]: zero mask - if bit is set, zero this element
12498+
if ((Mask & (1 << DstIdx)) != 0) {
12499+
return {0, -1};
12500+
}
12501+
// Bits [7:6]: select element from source vector Y (0-3)
12502+
// Bits [5:4]: select destination position (0-3)
12503+
unsigned SrcElem = (Mask >> 6) & 0x3;
12504+
unsigned DstElem = (Mask >> 4) & 0x3;
12505+
if (DstIdx == DstElem) {
12506+
// Insert element from source vector (B) at this position
12507+
return {1, static_cast<int>(SrcElem)};
12508+
} else {
12509+
// Copy from destination vector (A)
12510+
return {0, static_cast<int>(DstIdx)};
12511+
}
1248012512
}))
1248112513
return false;
1248212514
return Success(R, E);

clang/test/CodeGen/X86/sse41-builtins.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,16 @@ __m128 test_mm_insert_ps(__m128 x, __m128 y) {
307307
return _mm_insert_ps(x, y, 4);
308308
}
309309

310+
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x10), 1.0f, 10.0f, 3.0f, 4.0f))); // Insert Y[0] into X[1]
311+
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x00), 10.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0]
312+
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x20), 1.0f, 2.0f, 10.0f, 4.0f))); // Insert Y[0] into X[2]
313+
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x30), 1.0f, 2.0f, 3.0f, 10.0f))); // Insert Y[0] into X[3]
314+
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x80), 30.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[2] into X[0]
315+
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x01), 0.0f, 2.0f, 3.0f, 4.0f))); // Insert Y[0] into X[0], zero X[0]
316+
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0A), 10.0f, 0.0f, 3.0f, 0.0f))); // Insert Y[0] into X[0], zero X[1] and X[3]
317+
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0x0F), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[0] into X[0], zero all
318+
TEST_CONSTEXPR((match_m128(_mm_insert_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){10.0f, 20.0f, 30.0f, 40.0f}), 0xCF), 0.0f, 0.0f, 0.0f, 0.0f))); // Insert Y[3] into X[0], zero all
319+
310320
__m128i test_mm_max_epi8(__m128i x, __m128i y) {
311321
// CHECK-LABEL: test_mm_max_epi8
312322
// CHECK: call <16 x i8> @llvm.smax.v16i8(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})

0 commit comments

Comments
 (0)