-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[Headers][X86] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - allow PALIGNR byte shift intrinsics to be used in constexpr #162005
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
28819c6
4247c75
c70e2c7
fc26847
6d3534e
11ae9f1
a6e870d
2f0b435
e1c5df0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3447,6 +3447,44 @@ static bool interp__builtin_ia32_shuffle_generic( | |
| return true; | ||
| } | ||
|
|
||
| static bool interp__builtin_x86_palignr( | ||
| InterpState &S, CodePtr OpPC, const CallExpr *Call, | ||
| llvm::function_ref<std::pair<unsigned, int>(unsigned, unsigned, unsigned)> | ||
| GetSourceIndex) { | ||
|
|
||
| assert(Call->getNumArgs() == 3); | ||
| unsigned Shift = popToAPSInt(S, Call->getArg(2)).getZExtValue() & 0xff; | ||
|
|
||
| QualType Arg0Type = Call->getArg(0)->getType(); | ||
| const auto *VecT = Arg0Type->castAs<VectorType>(); | ||
| PrimType ElemT = *S.getContext().classify(VecT->getElementType()); | ||
| unsigned NumElems = VecT->getNumElements(); | ||
|
|
||
| const Pointer &B = S.Stk.pop<Pointer>(); | ||
| const Pointer &A = S.Stk.pop<Pointer>(); | ||
| const Pointer &Dst = S.Stk.peek<Pointer>(); | ||
|
|
||
| for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) { | ||
| auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, Shift, NumElems); | ||
|
|
||
| if (SrcIdx < 0) { | ||
| // Zero out this element | ||
| if (ElemT == PT_Float) { | ||
| Dst.elem<Floating>(DstIdx) = Floating( | ||
| S.getASTContext().getFloatTypeSemantics(VecT->getElementType())); | ||
| } else { | ||
| INT_TYPE_SWITCH_NO_BOOL(ElemT, { Dst.elem<T>(DstIdx) = T::from(0); }); | ||
| } | ||
| } else { | ||
| const Pointer &Src = (SrcVecIdx == 0) ? A : B; | ||
| TYPE_SWITCH(ElemT, { Dst.elem<T>(DstIdx) = Src.elem<T>(SrcIdx); }); | ||
| } | ||
| } | ||
| Dst.initializeAllElements(); | ||
|
|
||
| return true; | ||
| } | ||
|
|
||
| bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, | ||
| uint32_t BuiltinID) { | ||
| if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID)) | ||
|
|
@@ -4636,6 +4674,26 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, | |
| return APInt(8, 0); | ||
| }); | ||
|
|
||
| case X86::BI__builtin_ia32_palignr128: | ||
| case X86::BI__builtin_ia32_palignr256: | ||
| case X86::BI__builtin_ia32_palignr512: | ||
| return interp__builtin_x86_palignr( | ||
| S, OpPC, Call, [](unsigned DstIdx, unsigned Shift, unsigned NumElems) { | ||
| // Default to -1 → zero-fill this destination element | ||
| unsigned VecIdx = 0; | ||
| int ElemIdx = -1; | ||
|
|
||
| // Elements come from VecB first, then VecA after the shift boundary | ||
| unsigned ShiftedIdx = DstIdx + Shift; | ||
| if (ShiftedIdx < NumElems) { // from VecB | ||
|
||
| VecIdx = 1; | ||
| ElemIdx = DstIdx + Shift; | ||
| } else if (ShiftedIdx < 2 * NumElems) { // from VecA | ||
| ElemIdx = DstIdx + Shift - NumElems; | ||
| } | ||
| return std::pair<unsigned, int>{VecIdx, ElemIdx}; | ||
| }); | ||
|
|
||
| default: | ||
| S.FFDiag(S.Current->getLocation(OpPC), | ||
| diag::note_invalid_subexpr_in_const_expr) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13080,6 +13080,42 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { | |
|
|
||
| return Success(APValue(ResultElements.data(), ResultElements.size()), E); | ||
| } | ||
|
|
||
| case X86::BI__builtin_ia32_palignr128: | ||
| case X86::BI__builtin_ia32_palignr256: | ||
| case X86::BI__builtin_ia32_palignr512: { | ||
| assert(E->getNumArgs() == 3); | ||
|
||
|
|
||
| APValue VecA, VecB; | ||
| APSInt Imm; | ||
| if (!EvaluateAsRValue(Info, E->getArg(0), VecA) || | ||
| !EvaluateAsRValue(Info, E->getArg(1), VecB) || | ||
| !EvaluateInteger(E->getArg(2), Imm, Info)) | ||
| return false; | ||
|
|
||
| if (!VecA.isVector() || !VecB.isVector()) | ||
| return false; | ||
|
|
||
| unsigned LenA = VecA.getVectorLength(); | ||
| unsigned LenB = VecB.getVectorLength(); | ||
| assert(LenA == LenB && (LenA % 16 == 0)); | ||
|
|
||
| unsigned Shift = Imm.getZExtValue() & 0xff; | ||
| SmallVector<APValue> ResultElements; | ||
| for (unsigned I = 0; I < LenA; ++I) { | ||
| if (I + Shift < LenA) { | ||
| ResultElements.push_back(VecB.getVectorElt(I + Shift)); | ||
| } else if (I + Shift < LenA + LenB) { | ||
| ResultElements.push_back(VecA.getVectorElt(I + Shift - LenA)); | ||
| } else { | ||
| APSInt Zero(/*BitWidth=*/8, /*isUnsigned=*/true); | ||
| Zero = 0; | ||
| ResultElements.push_back(APValue(Zero)); | ||
| } | ||
| } | ||
|
|
||
| return Success(APValue(ResultElements.data(), ResultElements.size()), E); | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -109,12 +109,14 @@ __m256i test_mm256_alignr_epi8(__m256i a, __m256i b) { | |
| // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49> | ||
| return _mm256_alignr_epi8(a, b, 2); | ||
| } | ||
| TEST_CONSTEXPR(match_v32qi(_mm256_alignr_epi8(((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 2), 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 1, 2)); | ||
|
|
||
| __m256i test2_mm256_alignr_epi8(__m256i a, __m256i b) { | ||
| // CHECK-LABEL: test2_mm256_alignr_epi8 | ||
| // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48> | ||
| return _mm256_alignr_epi8(a, b, 17); | ||
| } | ||
| TEST_CONSTEXPR(match_v32qi(_mm256_alignr_epi8(((__m256i)(__v32qs){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), ((__m256i)(__v32qs){33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 64), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); | ||
|
||
|
|
||
| __m256i test_mm256_and_si256(__m256i a, __m256i b) { | ||
| // CHECK-LABEL: test_mm256_and_si256 | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You should be able to use interp__builtin_ia32_shuffle_generic now?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks. The callback used in
interp__builtin_ia32_shuffle_genericonly accepts two parameters,but in this case we need a callback that takes three parameters.
Maybe
interp__builtin_x86_palignrshould be renamed tointerp__builtin_ia32_shuffle_generic3, adding the number 3 to the end of the name.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Its fine to you to update interp__builtin_ia32_shuffle_generic to take a third numelts parameter - adjusting the other users should be trivial, or you split the 128/256/512 callbacks so the numelts are implicit - up to you