[Headers][X86] Allow MMX/SSE/AVX MOVMSK intrinsics to be used in constexpr #161914
@llvm/pr-subscribers-backend-x86 @llvm/pr-subscribers-clang
Author: Shawn K (kimsh02)
Changes: Fix #154520
Full diff: https://github.com/llvm/llvm-project/pull/161914.diff
12 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index e98bee28c15be..1387c6a18f500 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -182,7 +182,8 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
def cvttss2si : X86Builtin<"int(_Vector<4, float>)">;
}
-let Features = "sse", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
+let Features = "sse",
+ Attributes = [NoThrow, Constexpr, RequiredVectorWidth<128>] in {
def movmskps : X86Builtin<"int(_Vector<4, float>)">;
}
@@ -208,7 +209,8 @@ let Features = "sse2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def maskmovdqu : X86Builtin<"void(_Vector<16, char>, _Vector<16, char>, char *)">;
}
-let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "sse2",
+ Attributes = [NoThrow, Constexpr, RequiredVectorWidth<128>] in {
def movmskpd : X86Builtin<"int(_Vector<2, double>)">;
def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">;
}
@@ -523,6 +525,12 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in
def vtestnzcps : X86Builtin<"int(_Vector<4, float>, _Vector<4, float>)">;
}
+let Features = "avx",
+ Attributes = [NoThrow, Constexpr, RequiredVectorWidth<256>] in {
+ def movmskpd256 : X86Builtin<"int(_Vector<4, double>)">;
+ def movmskps256 : X86Builtin<"int(_Vector<8, float>)">;
+}
+
let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def vtestzpd256 : X86Builtin<"int(_Vector<4, double>, _Vector<4, double>)">;
def vtestcpd256 : X86Builtin<"int(_Vector<4, double>, _Vector<4, double>)">;
@@ -532,9 +540,8 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
def vtestnzcps256 : X86Builtin<"int(_Vector<8, float>, _Vector<8, float>)">;
def ptestz256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">;
def ptestc256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">;
- def ptestnzc256 : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">;
- def movmskpd256 : X86Builtin<"int(_Vector<4, double>)">;
- def movmskps256 : X86Builtin<"int(_Vector<8, float>)">;
+ def ptestnzc256
+ : X86Builtin<"int(_Vector<4, long long int>, _Vector<4, long long int>)">;
}
let Features = "avx", Attributes = [NoThrow] in {
@@ -569,6 +576,11 @@ let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in
def vec_set_v8si : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int, _Constant int)">;
}
+let Features = "avx2",
+ Attributes = [NoThrow, Constexpr, RequiredVectorWidth<256>] in {
+ def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
+}
+
let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">;
def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant int)">;
@@ -579,8 +591,8 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">;
def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def pmaddubsw256 : X86Builtin<"_Vector<16, short>(_Vector<32, char>, _Vector<32, char>)">;
- def pmaddwd256 : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
- def pmovmskb256 : X86Builtin<"int(_Vector<32, char>)">;
+ def pmaddwd256
+ : X86Builtin<"_Vector<8, int>(_Vector<16, short>, _Vector<16, short>)">;
def pmulhrsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">;
def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">;
def pshufb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">;
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index a2e97fcafdfef..ed36877d070e3 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -2773,6 +2773,46 @@ static bool interp__builtin_blend(InterpState &S, CodePtr OpPC,
return true;
}
+static bool interp__builtin_ia32_movmsk_op(InterpState &S, CodePtr OpPC,
+ const CallExpr *Call) {
+ assert(Call->getNumArgs() == 1);
+
+ const Pointer &Source = S.Stk.pop<Pointer>();
+
+ unsigned SourceLen = Source.getNumElems();
+ const QualType ElemQT = getElemType(Source);
+ const OptPrimType ElemPT = S.getContext().classify(ElemQT);
+ unsigned LaneWidth = S.getASTContext().getTypeSize(ElemQT);
+
+ if (ElemQT->isIntegerType()) {
+ unsigned Byte = 8;
+ unsigned ResultLen = (LaneWidth * SourceLen) / Byte;
+ APInt Result(ResultLen, 0);
+ unsigned ResultIdx = 0;
+ for (unsigned I = 0; I != SourceLen; ++I) {
+ APInt Lane;
+ INT_TYPE_SWITCH_NO_BOOL(*ElemPT,
+ { Lane = Source.elem<T>(I).toAPSInt(); });
+ for (unsigned J = 0; J != LaneWidth; J += Byte) {
+ Result.setBitVal(ResultIdx++, Lane[J + 7]);
+ }
+ }
+ pushInteger(S, Result.getZExtValue(), Call->getType());
+ return true;
+ }
+ if (ElemQT->isFloatingType()) {
+ APInt Result(SourceLen, 0);
+ using T = PrimConv<PT_Float>::T;
+ for (unsigned I = 0; I != SourceLen; ++I) {
+ APInt Lane = Source.elem<T>(I).getAPFloat().bitcastToAPInt();
+ Result.setBitVal(I, Lane[LaneWidth - 1]);
+ }
+ pushInteger(S, Result.getZExtValue(), Call->getType());
+ return true;
+ }
+ return false;
+}
+
static bool interp__builtin_elementwise_triop(
InterpState &S, CodePtr OpPC, const CallExpr *Call,
llvm::function_ref<APInt(const APSInt &, const APSInt &, const APSInt &)>
@@ -3355,6 +3395,15 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
return LHS.isSigned() ? LHS.ssub_sat(RHS) : LHS.usub_sat(RHS);
});
+ case clang::X86::BI__builtin_ia32_movmskps:
+ case clang::X86::BI__builtin_ia32_movmskpd:
+ case clang::X86::BI__builtin_ia32_pmovmskb128:
+ case clang::X86::BI__builtin_ia32_pmovmskb256:
+ case clang::X86::BI__builtin_ia32_movmskps256:
+ case clang::X86::BI__builtin_ia32_movmskpd256: {
+ return interp__builtin_ia32_movmsk_op(S, OpPC, Call);
+ }
+
case clang::X86::BI__builtin_ia32_pavgb128:
case clang::X86::BI__builtin_ia32_pavgw128:
case clang::X86::BI__builtin_ia32_pavgb256:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index b706b14945b6d..0926fbc415ce7 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -13650,7 +13650,6 @@ static bool getBuiltinAlignArguments(const CallExpr *E, EvalInfo &Info,
bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
unsigned BuiltinOp) {
-
auto HandleMaskBinOp =
[&](llvm::function_ref<APSInt(const APSInt &, const APSInt &)> Fn)
-> bool {
@@ -14679,6 +14678,44 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
return Success(CarryOut, E);
}
+ case clang::X86::BI__builtin_ia32_movmskps:
+ case clang::X86::BI__builtin_ia32_movmskpd:
+ case clang::X86::BI__builtin_ia32_pmovmskb128:
+ case clang::X86::BI__builtin_ia32_pmovmskb256:
+ case clang::X86::BI__builtin_ia32_movmskps256:
+ case clang::X86::BI__builtin_ia32_movmskpd256: {
+ APValue Source;
+ if (!Evaluate(Source, Info, E->getArg(0)))
+ return false;
+ unsigned SourceLen = Source.getVectorLength();
+ const VectorType *VT = E->getArg(0)->getType()->castAs<VectorType>();
+ const QualType ElemQT = VT->getElementType();
+ unsigned LaneWidth = Info.Ctx.getTypeSize(ElemQT);
+
+ if (ElemQT->isIntegerType()) { // Get MSB of each byte of every lane
+ unsigned Byte = 8;
+ unsigned ResultLen = (LaneWidth * SourceLen) / Byte;
+ APInt Result(ResultLen, 0);
+ unsigned ResultIdx = 0;
+ for (unsigned I = 0; I != SourceLen; ++I) {
+ APInt Lane = Source.getVectorElt(I).getInt();
+ for (unsigned J = 0; J != LaneWidth; J += Byte) {
+ Result.setBitVal(ResultIdx++, Lane[J + 7]);
+ }
+ }
+ return Success(Result.getZExtValue(), E);
+ }
+ if (ElemQT->isFloatingType()) { // Get sign bit of every lane
+ APInt Result(SourceLen, 0);
+ for (unsigned I = 0; I != SourceLen; ++I) {
+ APInt Lane = Source.getVectorElt(I).getFloat().bitcastToAPInt();
+ Result.setBitVal(I, Lane[LaneWidth - 1]);
+ }
+ return Success(Result.getZExtValue(), E);
+ }
+ return false;
+ }
+
case clang::X86::BI__builtin_ia32_bextr_u32:
case clang::X86::BI__builtin_ia32_bextr_u64:
case clang::X86::BI__builtin_ia32_bextri_u32:
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 31759c5386d9f..133def7b496ec 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -1306,9 +1306,8 @@ _mm256_min_epu32(__m256i __a, __m256i __b) {
/// \param __a
/// A 256-bit integer vector containing the source bytes.
/// \returns The 32-bit integer mask.
-static __inline__ int __DEFAULT_FN_ATTRS256
-_mm256_movemask_epi8(__m256i __a)
-{
+static __inline__ int __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_movemask_epi8(__m256i __a) {
return __builtin_ia32_pmovmskb256((__v32qi)__a);
}
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index d6ba19a6c78af..60c6f7a44a323 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -2960,9 +2960,8 @@ _mm256_testnzc_si256(__m256i __a, __m256i __b)
/// A 256-bit vector of [4 x double] containing the double-precision
/// floating point values with sign bits to be extracted.
/// \returns The sign bits from the operand, written to bits [3:0].
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_movemask_pd(__m256d __a)
-{
+static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_movemask_pd(__m256d __a) {
return __builtin_ia32_movmskpd256((__v4df)__a);
}
@@ -2978,9 +2977,8 @@ _mm256_movemask_pd(__m256d __a)
/// A 256-bit vector of [8 x float] containing the single-precision floating
/// point values with sign bits to be extracted.
/// \returns The sign bits from the operand, written to bits [7:0].
-static __inline int __DEFAULT_FN_ATTRS
-_mm256_movemask_ps(__m256 __a)
-{
+static __inline int __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm256_movemask_ps(__m256 __a) {
return __builtin_ia32_movmskps256((__v8sf)__a);
}
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 6597e7e7d4030..11ba0919152e8 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -4280,7 +4280,8 @@ _mm_packus_epi16(__m128i __a, __m128i __b) {
/// A 128-bit integer vector containing the values with bits to be extracted.
/// \returns The most significant bits from each 8-bit element in \a __a,
/// written to bits [15:0]. The other bits are assigned zeros.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_movemask_epi8(__m128i __a) {
return __builtin_ia32_pmovmskb128((__v16qi)__a);
}
@@ -4699,7 +4700,8 @@ _mm_unpacklo_pd(__m128d __a, __m128d __b) {
/// be extracted.
/// \returns The sign bits from each of the double-precision elements in \a __a,
/// written to bits [1:0]. The remaining bits are assigned values of zero.
-static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_movemask_pd(__m128d __a) {
return __builtin_ia32_movmskpd((__v2df)__a);
}
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index d876b4735a7d2..fe6afdcfc3fdb 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -2416,9 +2416,8 @@ _mm_min_pu8(__m64 __a, __m64 __b) {
/// A 64-bit integer vector containing the values with bits to be extracted.
/// \returns The most significant bit from each 8-bit element in \a __a,
/// written to bits [7:0].
-static __inline__ int __DEFAULT_FN_ATTRS_SSE2
-_mm_movemask_pi8(__m64 __a)
-{
+static __inline__ int __DEFAULT_FN_ATTRS_SSE2_CONSTEXPR
+_mm_movemask_pi8(__m64 __a) {
return __builtin_ia32_pmovmskb128((__v16qi)__zext128(__a));
}
@@ -3015,9 +3014,7 @@ _mm_cvtps_pi8(__m128 __a)
/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
/// single-precision floating-point element of the parameter. Bits [31:4] are
/// set to zero.
-static __inline__ int __DEFAULT_FN_ATTRS
-_mm_movemask_ps(__m128 __a)
-{
+static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR _mm_movemask_ps(__m128 __a) {
return __builtin_ia32_movmskps((__v4sf)__a);
}
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 3018bb9719b89..13ea5be32a5eb 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1329,12 +1329,16 @@ int test_mm256_movemask_pd(__m256d A) {
// CHECK: call {{.*}}i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %{{.*}})
return _mm256_movemask_pd(A);
}
+TEST_CONSTEXPR(_mm256_movemask_pd((__m256d)(__v4df){-1234.5678901234, 98765.4321098765, 0.000123456789, -3.14159265358979}) == 0x9);
+TEST_CONSTEXPR(_mm256_movemask_pd((__m256d)(__v4df){-0.000000987654321, -99999.999999999, 42.424242424242, 314159.2653589793}) == 0x3);
int test_mm256_movemask_ps(__m256 A) {
// CHECK-LABEL: test_mm256_movemask_ps
// CHECK: call {{.*}}i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %{{.*}})
return _mm256_movemask_ps(A);
}
+TEST_CONSTEXPR(_mm256_movemask_ps((__m256)(__v8sf){-12.3456f, 34.7890f, -0.0001234f, 123456.78f, -987.654f, 0.001234f, 3.14159f, -256.001f}) == 0x95);
+TEST_CONSTEXPR(_mm256_movemask_ps((__m256)(__v8sf){0.333333f, -45.6789f, 999.999f, -0.9999f, 17.234f, -128.512f, 2048.0f, -3.14f}) == 0xAA);
__m256d test_mm256_mul_pd(__m256d A, __m256d B) {
// CHECK-LABEL: test_mm256_mul_pd
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index eff2797e87c75..943881b8d599d 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -966,6 +966,9 @@ int test_mm256_movemask_epi8(__m256i a) {
// CHECK: call {{.*}}i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %{{.*}})
return _mm256_movemask_epi8(a);
}
+TEST_CONSTEXPR(_mm256_movemask_epi8((__m256i)(__v32qu){0x7F,0x80,0x01,0xFF,0x00,0xAA,0x55,0xC3,0x12,0x8E,0x00,0xFE,0x7E,0x81,0xFF,0x01,0xB6,0x00,0x39,0x40,0xD0,0x05,0x80,0x2A,0x7B,0x00,0x90,0xFF,0x01,0x34,0xC0,0x6D}) == 0x4C516AAA);
+TEST_CONSTEXPR(_mm256_movemask_epi8((__m256i)(__v8si){(int)0x80FF00AA,(int)0x7F0183E1,(int)0xDEADBEEF,(int)0xC0000001,(int)0x00000000,(int)0xFFFFFFFF,(int)0x12345678,(int)0x90ABCDEF}) == 0xF0F08F3D);
+TEST_CONSTEXPR(_mm256_movemask_epi8((__m256i)(__v4du){0xFF00000000000080ULL,0x7F010203040506C3ULL,0x8000000000000000ULL,0x0123456789ABCDEFULL}) == 0x0F800181);
__m256i test_mm256_mpsadbw_epu8(__m256i x, __m256i y) {
// CHECK-LABEL: test_mm256_mpsadbw_epu8
diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c
index 26c5f7315457e..23e0b17236966 100644
--- a/clang/test/CodeGen/X86/mmx-builtins.c
+++ b/clang/test/CodeGen/X86/mmx-builtins.c
@@ -399,6 +399,10 @@ int test_mm_movemask_pi8(__m64 a) {
// CHECK: call {{.*}}i32 @llvm.x86.sse2.pmovmskb.128(
return _mm_movemask_pi8(a);
}
+TEST_CONSTEXPR(_mm_movemask_pi8((__m64)((__v8qu){0x7F,0x80,0x01,0xFF,0x00,0xAA,0x55,0xC3})) == 0xAA);
+TEST_CONSTEXPR(_mm_movemask_pi8((__m64)((__v2si){(int)0x80FF00AA,(int)0x7F0183E1})) == 0x3D);
+TEST_CONSTEXPR(_mm_movemask_pi8((__m64)((__v1di){(long long)0xE110837A00924DB0ULL})) == 0xA5);
+
__m64 test_mm_mul_su32(__m64 a, __m64 b) {
// CHECK-LABEL: test_mm_mul_su32
diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
index 3bad3426b1586..f5c1d00d1b851 100644
--- a/clang/test/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CodeGen/X86/sse-builtins.c
@@ -561,6 +561,8 @@ int test_mm_movemask_ps(__m128 A) {
// CHECK: call {{.*}}i32 @llvm.x86.sse.movmsk.ps(<4 x float> %{{.*}})
return _mm_movemask_ps(A);
}
+TEST_CONSTEXPR(_mm_movemask_ps((__m128)(__v4sf){-2.0f, 3.0f, -5.5f, -0.0f}) == 0xD);
+TEST_CONSTEXPR(_mm_movemask_ps((__m128)(__v4sf){-7.348215e5, 0.00314159, -12.789, 2.7182818}) == 0x5);
__m128 test_mm_mul_ps(__m128 A, __m128 B) {
// CHECK-LABEL: test_mm_mul_ps
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 84b90c09444c2..698686038b32f 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -953,12 +953,17 @@ int test_mm_movemask_epi8(__m128i A) {
// CHECK: call {{.*}}i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %{{.*}})
return _mm_movemask_epi8(A);
}
+TEST_CONSTEXPR(_mm_movemask_epi8((__m128i)(__v16qu){0x7F,0x80,0x01,0xFF,0x00,0xAA,0x55,0xC3,0x12,0x8E,0x00,0xFE,0x7E,0x81,0xFF,0x01}) == 0x6AAA);
+TEST_CONSTEXPR(_mm_movemask_epi8((__m128i)(__v4si){(int)0x80FF00AA,(int)0x7F0183E1,(int)0xDEADBEEF,(int)0xC0000001}) == 0x8F3D);
+TEST_CONSTEXPR(_mm_movemask_epi8((__m128i)(__v2du){0xFF00000000000080ULL,0x7F010203040506C3ULL}) == 0x181);
int test_mm_movemask_pd(__m128d A) {
// CHECK-LABEL: test_mm_movemask_pd
// CHECK: call {{.*}}i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %{{.*}})
return _mm_movemask_pd(A);
}
+TEST_CONSTEXPR(_mm_movemask_pd((__m128d)(__v2df){-12345.67890123, 4567.89012345}) == 0x1);
+TEST_CONSTEXPR(_mm_movemask_pd((__m128d)(__v2df){0.0000987654321, 09876.5432109876}) == 0x0);
__m128i test_mm_mul_epu32(__m128i A, __m128i B) {
// CHECK-LABEL: test_mm_mul_epu32
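A minimal usage sketch (not part of the patch), assuming C++ with SSE enabled: after this change the movemask intrinsics fold in constant expressions, so the values checked by the new tests can also be verified with static_assert. The constant name is illustrative only.

#include <immintrin.h>

// Sign bits are set in lanes 0, 2 and 3, so the expected mask is 0b1101 == 0xD.
constexpr int kMask = _mm_movemask_ps((__m128)(__v4sf){-2.0f, 3.0f, -5.5f, -0.0f});
static_assert(kMask == 0xD, "MOVMSK folded at compile time");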
{ Lane = Source.elem<T>(I).toAPSInt(); });
for (unsigned J = 0; J != LaneWidth; J += Byte) {
  Result.setBitVal(ResultIdx++, Lane[J + 7]);
}
Why can't this be merged into the INT_TYPE_SWITCH_NO_BOOL?
Using APInt::isNegative() might be cleaner than direct bit access.
Not understanding the intent for APInt::isNegative() here. I get the MSB of every byte.
IsNegative will do the MSB extraction for us; it's a lot cleaner than CHAR_BIT bit twiddling, and we're already using it for select/BLENDV folding
Calling Lane.isNegative() gets the MSB for the lane, not for every byte of the lane. I applied isNegative() for the floating type case however. 👍
MOVMSK needs to work per element, not per byte - PMOVMSKB has a vXi8 type so each element happens to be byte size, but all the ElemBitWidth/BitsInAByte code is unnecessary
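For reference, a minimal sketch of the per-element folding being suggested, using only llvm::APInt — not the exact patch code, and the helper name is hypothetical:

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"

// One result bit per element, taken from that element's sign bit; for
// PMOVMSKB the elements are i8, so this still produces one bit per byte.
static llvm::APInt foldMovmsk(llvm::ArrayRef<llvm::APInt> Lanes) {
  llvm::APInt Result(Lanes.size(), 0);
  for (unsigned I = 0; I != Lanes.size(); ++I)
    Result.setBitVal(I, Lanes[I].isNegative());
  return Result;
}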
Apply feedback 👍
@RKSimon My PR could use a look 🫡
@RKSimon Fixed rebase
LGTM
APInt Lane = Source.elem<T>(I).toAPSInt();
for (unsigned J = 0; J != LaneWidth; J += BitsInAByte) {
  Result.setBitVal(ResultIdx++, Lane[J + 7]);
}
This for loop is unnecessary as LaneWidth == 8:
APInt Lane = Source.elem<T>(I).toAPSInt();
Result.setBitVal(I, Lane.isNegative());
clang/lib/AST/ExprConstant.cpp (outdated):
APInt Lane = Source.getVectorElt(I).getInt();
for (unsigned J = 0; J != LaneWidth; J += BitsInAByte) {
  Result.setBitVal(ResultIdx++, Lane[J + 7]);
}
APInt Lane = Source.getVectorElt(I).getInt();
Result.setBitVal(ResultIdx++, Lane.isNegative());
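In both evaluators the per-element form removes the LaneWidth/Byte bookkeeping: PMOVMSKB operates on vXi8, so one sign bit per element is already one bit per byte, and the floating-point variants only ever need the sign bit of each lane.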
@RKSimon I realize it's probably helpful to be able to look back at the past feedback you put down for my PR, so I unresolved the threads for the issue you brought up twice 👍
ping
There doesn't seem to be a build artifact for Linux AArch64 so I'll assume my PR is passing?
LGTM