-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - Allow AVX512 VPSHUFBITQMB intrinsics to be used in constexpr #168100
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-clang @llvm/pr-subscribers-backend-x86 Author: NagaChaitanya Vellanki (chaitanyav) ChangesResolves: #161337 Full diff: https://github.com/llvm/llvm-project/pull/168100.diff 7 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 69d18679fd6ec..289409c622ca0 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -1390,15 +1390,15 @@ let Features = "avx512cd", Attributes = [NoThrow, Const, Constexpr, RequiredVect
def vpconflictsi_512 : X86Builtin<"_Vector<16, int>(_Vector<16, int>)">;
}
-let Features = "avx512vl,avx512bitalg", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "avx512vl,avx512bitalg", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def vpshufbitqmb128_mask : X86Builtin<"unsigned short(_Vector<16, char>, _Vector<16, char>, unsigned short)">;
}
-let Features = "avx512vl,avx512bitalg", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "avx512vl,avx512bitalg", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def vpshufbitqmb256_mask : X86Builtin<"unsigned int(_Vector<32, char>, _Vector<32, char>, unsigned int)">;
}
-let Features = "avx512bitalg", Attributes = [NoThrow, Const, RequiredVectorWidth<512>] in {
+let Features = "avx512bitalg", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in {
def vpshufbitqmb512_mask : X86Builtin<"unsigned long long int(_Vector<64, char>, _Vector<64, char>, unsigned long long int)">;
}
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index cee3c1b8cf8f3..76e27457b8cca 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3391,6 +3391,72 @@ static bool interp__builtin_ia32_shuffle_generic(
return true;
}
+static bool interp__builtin_ia32_shufbitqmb_mask(InterpState &S, CodePtr OpPC,
+ const CallExpr *Call) {
+
+ assert(Call->getNumArgs() == 3);
+
+ QualType SourceType = Call->getArg(0)->getType();
+ QualType ShuffleMaskType = Call->getArg(1)->getType();
+ QualType ZeroMaskType = Call->getArg(2)->getType();
+ if (!SourceType->isVectorType() || !ShuffleMaskType->isVectorType() ||
+ !ZeroMaskType->isIntegerType()) {
+ return false;
+ }
+
+ Pointer Source, ShuffleMask;
+ APSInt ZeroMask = popToAPSInt(S, Call->getArg(2));
+ ShuffleMask = S.Stk.pop<Pointer>();
+ Source = S.Stk.pop<Pointer>();
+
+ const auto *SourceVecT = SourceType->castAs<VectorType>();
+ const auto *ShuffleMaskVecT = ShuffleMaskType->castAs<VectorType>();
+ assert(SourceVecT->getNumElements() == ShuffleMaskVecT->getNumElements());
+ assert(ZeroMask.getBitWidth() == SourceVecT->getNumElements());
+
+ PrimType SourceElemT = *S.getContext().classify(SourceVecT->getElementType());
+ PrimType ShuffleMaskElemT =
+ *S.getContext().classify(ShuffleMaskVecT->getElementType());
+
+ const unsigned NumBytesInQWord = 8;
+ const unsigned NumBitsInByte = 8;
+ const unsigned NumBytes = SourceVecT->getNumElements();
+ const unsigned NumQWords = NumBytes / NumBytesInQWord;
+ const unsigned RetWidth = ZeroMask.getBitWidth();
+ APSInt RetMask(llvm::APInt(RetWidth, 0), /*isUnsigned=*/true);
+
+ for (unsigned QWordId = 0; QWordId != NumQWords; ++QWordId) {
+
+ APInt SourceQWord(64, 0);
+ for (unsigned ByteInQWord = 0; ByteInQWord != NumBytesInQWord;
+ ++ByteInQWord) {
+ uint64_t Byte = 0;
+ INT_TYPE_SWITCH(SourceElemT, {
+ Byte = static_cast<uint64_t>(
+ Source.elem<T>(QWordId * NumBytesInQWord + ByteInQWord));
+ });
+ SourceQWord |= (Byte & 0xFF) << (ByteInQWord * NumBitsInByte);
+ }
+
+ for (unsigned ByteInQWord = 0; ByteInQWord != NumBytesInQWord;
+ ++ByteInQWord) {
+ unsigned ByteIdx = QWordId * NumBytesInQWord + ByteInQWord;
+ unsigned M = 0;
+ INT_TYPE_SWITCH(ShuffleMaskElemT, {
+ M = static_cast<unsigned>(ShuffleMask.elem<T>(ByteIdx)) & 0x3F;
+ });
+
+ if (ZeroMask[ByteIdx]) {
+ RetMask.setBitVal(ByteIdx, SourceQWord[M]);
+ }
+ }
+ }
+
+ pushInteger(S, RetMask, Call->getType());
+
+ return true;
+}
+
bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
uint32_t BuiltinID) {
if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
@@ -4712,6 +4778,12 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_ucmpq512_mask:
return interp__builtin_ia32_cmp_mask(S, OpPC, Call, BuiltinID,
/*IsUnsigned=*/true);
+
+ case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
+ case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
+ case X86::BI__builtin_ia32_vpshufbitqmb512_mask:
+ return interp__builtin_ia32_shufbitqmb_mask(S, OpPC, Call);
+
case X86::BI__builtin_ia32_pslldqi128_byteshift:
case X86::BI__builtin_ia32_pslldqi256_byteshift:
case X86::BI__builtin_ia32_pslldqi512_byteshift:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 29357eec2eeb6..e0ef052bed022 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -16581,6 +16581,52 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
return Success(APValue(RetMask), E);
}
+ case X86::BI__builtin_ia32_vpshufbitqmb128_mask:
+ case X86::BI__builtin_ia32_vpshufbitqmb256_mask:
+ case X86::BI__builtin_ia32_vpshufbitqmb512_mask: {
+ assert(E->getNumArgs() == 3);
+
+ APValue Source, ShuffleMask;
+ APSInt ZeroMask;
+ if (!EvaluateVector(E->getArg(0), Source, Info) ||
+ !EvaluateVector(E->getArg(1), ShuffleMask, Info) ||
+ !EvaluateInteger(E->getArg(2), ZeroMask, Info))
+ return false;
+
+ assert(Source.getVectorLength() == ShuffleMask.getVectorLength());
+ assert(ZeroMask.getBitWidth() == Source.getVectorLength());
+
+ unsigned NumBytesInQWord = 8;
+ unsigned NumBitsInByte = 8;
+ unsigned NumBytes = Source.getVectorLength();
+ unsigned NumQWords = NumBytes / NumBytesInQWord;
+ unsigned RetWidth = ZeroMask.getBitWidth();
+ APSInt RetMask(llvm::APInt(RetWidth, 0), /*isUnsigned=*/true);
+
+ for (unsigned QWordId = 0; QWordId != NumQWords; ++QWordId) {
+
+ APInt SourceQWord(64, 0);
+ for (unsigned ByteInQWord = 0; ByteInQWord != NumBytesInQWord;
+ ++ByteInQWord) {
+ uint64_t Byte =
+ Source.getVectorElt(QWordId * NumBytesInQWord + ByteInQWord)
+ .getInt()
+ .getZExtValue();
+ SourceQWord |= (Byte & 0xFF) << (ByteInQWord * NumBitsInByte);
+ }
+
+ for (unsigned ByteInQWord = 0; ByteInQWord != NumBytesInQWord;
+ ++ByteInQWord) {
+ unsigned ByteIdx = QWordId * NumBytesInQWord + ByteInQWord;
+ unsigned M =
+ ShuffleMask.getVectorElt(ByteIdx).getInt().getZExtValue() & 0x3F;
+ if (ZeroMask[ByteIdx]) {
+ RetMask.setBitVal(ByteIdx, SourceQWord[M]);
+ }
+ }
+ }
+ return Success(APValue(RetMask), E);
+ }
}
}
diff --git a/clang/lib/Headers/avx512bitalgintrin.h b/clang/lib/Headers/avx512bitalgintrin.h
index 98197e468370d..6a3e47814cb93 100644
--- a/clang/lib/Headers/avx512bitalgintrin.h
+++ b/clang/lib/Headers/avx512bitalgintrin.h
@@ -15,44 +15,43 @@
#define __AVX512BITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS \
+ constexpr \
+ __attribute__((__always_inline__, __nodebug__, \
+ __target__("avx512bitalg"), __min_vector_width__(512)))
+#else
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), \
__min_vector_width__(512)))
-
-#if defined(__cplusplus) && (__cplusplus >= 201103L)
-#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr
-#else
-#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS
#endif
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm512_popcnt_epi16(__m512i __A) {
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi16(__m512i __A) {
return (__m512i)__builtin_elementwise_popcount((__v32hu)__A);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) {
return (__m512i)__builtin_ia32_selectw_512(
(__mmask32)__U, (__v32hi)_mm512_popcnt_epi16(__B), (__v32hi)__A);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B) {
return _mm512_mask_popcnt_epi16((__m512i)_mm512_setzero_si512(), __U, __B);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
-_mm512_popcnt_epi8(__m512i __A) {
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi8(__m512i __A) {
return (__m512i)__builtin_elementwise_popcount((__v64qu)__A);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) {
return (__m512i)__builtin_ia32_selectb_512(
(__mmask64)__U, (__v64qi)_mm512_popcnt_epi8(__B), (__v64qi)__A);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS_CONSTEXPR
+static __inline__ __m512i __DEFAULT_FN_ATTRS
_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B) {
return _mm512_mask_popcnt_epi8((__m512i)_mm512_setzero_si512(), __U, __B);
}
@@ -74,6 +73,4 @@ _mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B)
}
#undef __DEFAULT_FN_ATTRS
-#undef __DEFAULT_FN_ATTRS_CONSTEXPR
-
#endif
diff --git a/clang/lib/Headers/avx512vlbitalgintrin.h b/clang/lib/Headers/avx512vlbitalgintrin.h
index 1874adce4ea6e..624630f76e484 100644
--- a/clang/lib/Headers/avx512vlbitalgintrin.h
+++ b/clang/lib/Headers/avx512vlbitalgintrin.h
@@ -15,6 +15,16 @@
#define __AVX512VLBITALGINTRIN_H
/* Define the default attributes for the functions in this file. */
+#if defined(__cplusplus) && (__cplusplus >= 201103L)
+#define __DEFAULT_FN_ATTRS128 \
+ constexpr __attribute__((__always_inline__, __nodebug__, \
+ __target__("avx512vl,avx512bitalg"), \
+ __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+ constexpr __attribute__((__always_inline__, __nodebug__, \
+ __target__("avx512vl,avx512bitalg"), \
+ __min_vector_width__(256)))
+#else
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512bitalg"), \
@@ -23,75 +33,66 @@
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl,avx512bitalg"), \
__min_vector_width__(256)))
-
-#if defined(__cplusplus) && (__cplusplus >= 201103L)
-#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128 constexpr
-#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256 constexpr
-#else
-#define __DEFAULT_FN_ATTRS128_CONSTEXPR __DEFAULT_FN_ATTRS128
-#define __DEFAULT_FN_ATTRS256_CONSTEXPR __DEFAULT_FN_ATTRS256
#endif
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi16(__m256i __A) {
return (__m256i)__builtin_elementwise_popcount((__v16hu)__A);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) {
return (__m256i)__builtin_ia32_selectw_256(
(__mmask16)__U, (__v16hi)_mm256_popcnt_epi16(__B), (__v16hi)__A);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) {
return _mm256_mask_popcnt_epi16((__m256i)_mm256_setzero_si256(), __U, __B);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_popcnt_epi16(__m128i __A) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_popcnt_epi16(__m128i __A) {
return (__m128i)__builtin_elementwise_popcount((__v8hu)__A);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) {
return (__m128i)__builtin_ia32_selectw_128(
(__mmask8)__U, (__v8hi)_mm_popcnt_epi16(__B), (__v8hi)__A);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) {
return _mm_mask_popcnt_epi16((__m128i)_mm_setzero_si128(), __U, __B);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_popcnt_epi8(__m256i __A) {
return (__m256i)__builtin_elementwise_popcount((__v32qu)__A);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) {
return (__m256i)__builtin_ia32_selectb_256(
(__mmask32)__U, (__v32qi)_mm256_popcnt_epi8(__B), (__v32qi)__A);
}
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) {
return _mm256_mask_popcnt_epi8((__m256i)_mm256_setzero_si256(), __U, __B);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_popcnt_epi8(__m128i __A) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_popcnt_epi8(__m128i __A) {
return (__m128i)__builtin_elementwise_popcount((__v16qu)__A);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) {
return (__m128i)__builtin_ia32_selectb_128(
(__mmask16)__U, (__v16qi)_mm_popcnt_epi8(__B), (__v16qi)__A);
}
-static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) {
return _mm_mask_popcnt_epi8((__m128i)_mm_setzero_si128(), __U, __B);
}
@@ -131,7 +132,4 @@ _mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B)
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
-#undef __DEFAULT_FN_ATTRS128_CONSTEXPR
-#undef __DEFAULT_FN_ATTRS256_CONSTEXPR
-
#endif
diff --git a/clang/test/CodeGen/X86/avx512bitalg-builtins.c b/clang/test/CodeGen/X86/avx512bitalg-builtins.c
index 3ac8674421d93..7d524ab156500 100644
--- a/clang/test/CodeGen/X86/avx512bitalg-builtins.c
+++ b/clang/test/CodeGen/X86/avx512bitalg-builtins.c
@@ -70,4 +70,14 @@ __mmask64 test_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B) {
// CHECK: @llvm.x86.avx512.vpshufbitqmb.512
return _mm512_bitshuffle_epi64_mask(__A, __B);
}
+TEST_CONSTEXPR(_mm512_bitshuffle_epi64_mask(
+ (__m512i)(__v64qi){1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,-128, -1,0,0,0,0,0,0,0, 85,85,85,85,85,85,85,85,
+ 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,-128, -1,0,0,0,0,0,0,0, 85,85,85,85,85,85,85,85},
+ (__m512i)(__v64qi){0,1,2,3,4,5,6,7, 63,62,61,60,59,58,57,56, 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,7,
+ 0,1,2,3,4,5,6,7, 63,62,61,60,59,58,57,56, 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,7}) == 0x55ff010155ff0101ULL);
+TEST_CONSTEXPR(_mm512_mask_bitshuffle_epi64_mask(0xFFFFFFFF00000000ULL,
+ (__m512i)(__v64qi){1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,-128, -1,0,0,0,0,0,0,0, 85,85,85,85,85,85,85,85,
+ 1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,-128, -1,0,0,0,0,0,0,0, 85,85,85,85,85,85,85,85},
+ (__m512i)(__v64qi){0,1,2,3,4,5,6,7, 63,62,61,60,59,58,57,56, 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,7,
+ 0,1,2,3,4,5,6,7, 63,62,61,60,59,58,57,56, 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,7}) == 0x55ff010100000000ULL);
diff --git a/clang/test/CodeGen/X86/avx512vlbitalg-builtins.c b/clang/test/CodeGen/X86/avx512vlbitalg-builtins.c
index e0b55c6fde81a..d19d2c28f3649 100644
--- a/clang/test/CodeGen/X86/avx512vlbitalg-builtins.c
+++ b/clang/test/CodeGen/X86/avx512vlbitalg-builtins.c
@@ -116,6 +116,13 @@ __mmask32 test_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B) {
// CHECK: @llvm.x86.avx512.vpshufbitqmb.256
return _mm256_bitshuffle_epi64_mask(__A, __B);
}
+TEST_CONSTEXPR(_mm256_bitshuffle_epi64_mask(
+ (__m256i)(__v32qi){1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,-128, -1,0,0,0,0,0,0,0, 85,85,85,85,85,85,85,85},
+ (__m256i)(__v32qi){0,1,2,3,4,5,6,7, 63,62,61,60,59,58,57,56, 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,7}) == 0x55ff0101);
+
+TEST_CONSTEXPR(_mm256_mask_bitshuffle_epi64_mask(0xFFFF0000,
+ (__m256i)(__v32qi){1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,-128, -1,0,0,0,0,0,0,0, 85,85,85,85,85,85,85,85},
+ (__m256i)(__v32qi){0,1,2,3,4,5,6,7, 63,62,61,60,59,58,57,56, 0,1,2,3,4,5,6,7, 0,1,2,3,4,5,6,7}) == 0x55ff0000);
__mmask16 test_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B) {
// CHECK-LABEL: test_mm_mask_bitshuffle_epi64_mask
@@ -129,4 +136,11 @@ __mmask16 test_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B) {
// CHECK: @llvm.x86.avx512.vpshufbitqmb.128
return _mm_bitshuffle_epi64_mask(__A, __B);
}
+TEST_CONSTEXPR(_mm_bitshuffle_epi64_mask(
+ (__m128i)(__v16qi){1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,-128},
+ (__m128i)(__v16qi){0,1,2,3,4,5,6,7, 63,62,61,60,59,58,57,56}) == 0x0101);
+
+TEST_CONSTEXPR(_mm_mask_bitshuffle_epi64_mask(0xFF00,
+ (__m128i)(__v16qi){1,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,-128},
+ (__m128i)(__v16qi){0,1,2,3,4,5,6,7, 63,62,61,60,59,58,57,56}) == 0x0100);
|
|
@RKSimon tested this on my machine. Did fuzz test comparing compile time vs runtime results. Attached the markdown file with results. model name : AMD Ryzen 9 9950X3D 16-Core Processor
stepping : 0
microcode : 0xffffffff
cpu MHz : 4291.861
cache size : 1024 KB
physical id : 0
siblings : 32
core id : 15
cpu cores : 16
apicid : 31
initial apicid : 31
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid tsc_known_freq pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy svm cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 clzero xsaveerptr arat npt nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold v_vmsave_vmload avx512vbmi umip avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq rdpid fsrm avx512_vp2intersect
bugs : sysret_ss_attrs null_seg spectre_v1 spectre_v2 spec_store_bypass |
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Mainly a few style comments (applicable to both InterpBuiltin and ExprConstant implementations).
| #define __DEFAULT_FN_ATTRS \ | ||
| constexpr \ | ||
| __attribute__((__always_inline__, __nodebug__, \ | ||
| __target__("avx512bitalg"), __min_vector_width__(512))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(style) put the constexpr at the end of the attribute list and reformat
| #define __DEFAULT_FN_ATTRS256 \ | ||
| constexpr __attribute__((__always_inline__, __nodebug__, \ | ||
| __target__("avx512vl,avx512bitalg"), \ | ||
| __min_vector_width__(256))) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(style) put the constexpr at the end of the attribute list and reformat
| APSInt RetMask(llvm::APInt(RetWidth, 0), /*isUnsigned=*/true); | ||
|
|
||
| for (unsigned QWordId = 0; QWordId != NumQWords; ++QWordId) { | ||
|
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(style) remove empty line
| for (unsigned QWordId = 0; QWordId != NumQWords; ++QWordId) { | ||
|
|
||
| APInt SourceQWord(64, 0); | ||
| for (unsigned ByteInQWord = 0; ByteInQWord != NumBytesInQWord; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ByteInQWord -> ByteIdx to help compact code?
| SourceQWord |= (Byte & 0xFF) << (ByteInQWord * NumBitsInByte); | ||
| } | ||
|
|
||
| for (unsigned ByteInQWord = 0; ByteInQWord != NumBytesInQWord; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ByteInQWord -> ByteIdx to help compact code?
| Byte = static_cast<uint64_t>( | ||
| Source.elem<T>(QWordId * NumBytesInQWord + ByteInQWord)); | ||
| }); | ||
| SourceQWord |= (Byte & 0xFF) << (ByteInQWord * NumBitsInByte); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Use APInt insertBits?
| PrimType ShuffleMaskElemT = | ||
| *S.getContext().classify(ShuffleMaskVecT->getElementType()); | ||
|
|
||
| const unsigned NumBytesInQWord = 8; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove all the consts.
… AVX512 VPSHUFBITQMB intrinsics to be used in constexpr Resolves: llvm#161337
- Rename ByteInQWord to ByteIdx for loop counter within qword - Add SelIdx for absolute byte index used in mask selection - Use APInt::insertBits() instead of manual shift/OR for building SourceQWord - Remove unnecessary const from local variables - Move constexpr after __attribute__ in header macros - Minor formatting cleanup
8a8cc05 to
457e548
Compare
🐧 Linux x64 Test Results
|
RKSimon
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM - cheers
… AVX512 VPSHUFBITQMB intrinsics to be used in constexpr (llvm#168100) Resolves llvm#161337
… AVX512 VPSHUFBITQMB intrinsics to be used in constexpr (llvm#168100) Resolves llvm#161337
Resolves: #161337