[X86] EmitX86BuiltinExpr - attempt to convert SSE41/AVX1 roundps/d/ss/sd builtins to regular rounding modes #171227
base: main
Conversation
@llvm/pr-subscribers-clang-codegen @llvm/pr-subscribers-backend-x86

Author: None (stomfaig)

Changes

Adding clauses to `CodeGenFunction::EmitX86BuiltinExpr` to convert the SSE4.1/AVX1 builtins `roundps/pd/ss/sd` to regular rounding modes. I used:

- `roundeven`/`floor`/`ceil`/`trunc` when PE is not set and not using MXCSR
- `experimental_constrained_roundeven`/`floor`/`ceil`/`trunc` when setting PE and not using MXCSR
- `experimental_constrained_nearbyint` when setting MXCSR

Closes #170273

Full diff: https://github.com/llvm/llvm-project/pull/171227.diff

5 Files Affected:
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
index be2b7d442645e..c8b55e855e717 100644
--- a/clang/lib/CodeGen/TargetBuiltins/X86.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -75,6 +75,80 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
return MaskVec;
}
+static Value *emitX86Round(CodeGenFunction &CGF, Value *X, unsigned M) {
+ unsigned RoundingMask = 0b11;
+ unsigned UpdatePEBit = 0b100;
+ unsigned UseMXCSRBit = 0b1000;
+
+ unsigned roundingMode = M & RoundingMask;
+ bool updatePE = M & UpdatePEBit;
+ bool useMXCSR = M & UseMXCSRBit;
+
+ Intrinsic::ID ID = Intrinsic::not_intrinsic;
+ LLVMContext &Ctx = CGF.CGM.getLLVMContext();
+
+ if (useMXCSR) {
+ ID = Intrinsic::experimental_constrained_nearbyint;
+
+ auto PE_metatadata = updatePE ? "fpexcept.strict" : "fpexcept.ignore";
+
+ Value *ExceptMode =
+ MetadataAsValue::get(Ctx, MDString::get(Ctx, PE_metatadata));
+
+ Value *RoundingMode =
+ MetadataAsValue::get(Ctx, MDString::get(Ctx, "rounding.dynamic"));
+
+ Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
+ return CGF.Builder.CreateCall(F, {X, ExceptMode, RoundingMode});
+ }
+
+ if (updatePE) {
+ switch (roundingMode) {
+ case 0b00:
+ ID = Intrinsic::experimental_constrained_roundeven;
+ break;
+ case 0b01:
+ ID = Intrinsic::experimental_constrained_floor;
+ break;
+ case 0b10:
+ ID = Intrinsic::experimental_constrained_ceil;
+ break;
+ case 0b11:
+ ID = Intrinsic::experimental_constrained_trunc;
+ break;
+ default:
+ llvm_unreachable("Invalid rounding mode");
+ }
+
+ Value *ExceptMode =
+ MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.strict"));
+
+ Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
+ return CGF.Builder.CreateCall(F, {X, ExceptMode});
+ }
+
+ // Otherwise we can use the standard ops
+ switch (roundingMode) {
+ case 0b00:
+ ID = Intrinsic::roundeven;
+ break;
+ case 0b01:
+ ID = Intrinsic::floor;
+ break;
+ case 0b10:
+ ID = Intrinsic::ceil;
+ break;
+ case 0b11:
+ ID = Intrinsic::trunc;
+ break;
+ default:
+ llvm_unreachable("Invalid rounding mode");
+ }
+
+ Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
+ return CGF.Builder.CreateCall(F, {X});
+}
+
static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
Align Alignment) {
Value *Ptr = Ops[0];
@@ -840,6 +914,23 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Ops[0]);
return Builder.CreateExtractValue(Call, 0);
}
+ case X86::BI__builtin_ia32_roundps:
+ case X86::BI__builtin_ia32_roundpd:
+ case X86::BI__builtin_ia32_roundps256:
+ case X86::BI__builtin_ia32_roundpd256: {
+ unsigned M = cast<ConstantInt>(Ops[1])->getZExtValue();
+ return emitX86Round(*this, Ops[0], M);
+ }
+ case X86::BI__builtin_ia32_roundss:
+ case X86::BI__builtin_ia32_roundsd: {
+ unsigned M = cast<ConstantInt>(Ops[2])->getZExtValue();
+
+ Value *idx = Builder.getInt32(0);
+ Value *ValAt0 = Builder.CreateExtractElement(Ops[1], idx);
+ Value *RoundedAt0 = emitX86Round(*this, ValAt0, M);
+
+ return Builder.CreateInsertElement(Ops[0], RoundedAt0, idx);
+ }
case X86::BI__builtin_ia32_lzcnt_u16:
case X86::BI__builtin_ia32_lzcnt_u32:
case X86::BI__builtin_ia32_lzcnt_u64: {
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 13da4292c5b92..f3844adf0a498 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -246,13 +246,13 @@ TEST_CONSTEXPR(match_m128i(_mm256_castsi256_si128((__m256i)(__v4du){0xBFF0000000
__m256d test_mm256_ceil_pd(__m256d x) {
// CHECK-LABEL: test_mm256_ceil_pd
- // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 2)
+ // CHECK: %{{.*}} = call <4 x double> @llvm.ceil.v4f64(<4 x double> %{{.*}})
return _mm256_ceil_pd(x);
}
__m256 test_mm_ceil_ps(__m256 x) {
// CHECK-LABEL: test_mm_ceil_ps
- // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 2)
+ // CHECK: %{{.*}} = call <8 x float> @llvm.ceil.v8f32(<8 x float> %{{.*}})
return _mm256_ceil_ps(x);
}
@@ -1095,13 +1095,13 @@ TEST_CONSTEXPR(match_m128i(_mm256_extractf128_si256(((__m256i){0ULL, 1ULL, 2ULL,
__m256d test_mm256_floor_pd(__m256d x) {
// CHECK-LABEL: test_mm256_floor_pd
- // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = call <4 x double> @llvm.floor.v4f64(<4 x double> %{{.*}})
return _mm256_floor_pd(x);
}
__m256 test_mm_floor_ps(__m256 x) {
// CHECK-LABEL: test_mm_floor_ps
- // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = call <8 x float> @llvm.floor.v8f32(<8 x float> %{{.*}})
return _mm256_floor_ps(x);
}
@@ -1511,13 +1511,13 @@ __m256 test_mm256_rcp_ps(__m256 A) {
__m256d test_mm256_round_pd(__m256d x) {
// CHECK-LABEL: test_mm256_round_pd
- // CHECK: call {{.*}}<4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 4)
+ // CHECK: %{{.*}} = call <4 x double> @llvm.experimental.constrained.roundeven.v4f64(<4 x double> %{{.*}}, metadata !"fpexcept.strict")
return _mm256_round_pd(x, 4);
}
__m256 test_mm256_round_ps(__m256 x) {
// CHECK-LABEL: test_mm256_round_ps
- // CHECK: call {{.*}}<8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 4)
+ // CHECK: %{{.*}} = call <8 x float> @llvm.experimental.constrained.roundeven.v8f32(<8 x float> %{{.*}}, metadata !"fpexcept.strict")
return _mm256_round_ps(x, 4);
}
diff --git a/clang/test/CodeGen/X86/pr51324.c b/clang/test/CodeGen/X86/pr51324.c
index 10d1ba3c84b85..de97183aa6613 100644
--- a/clang/test/CodeGen/X86/pr51324.c
+++ b/clang/test/CodeGen/X86/pr51324.c
@@ -9,7 +9,7 @@
// Make sure brackets work after macro intrinsics.
float pr51324(__m128 a) {
// CHECK-LABEL: pr51324
- // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 0)
+ // call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{.*}})
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _mm_round_ps(a, 0)[0];
}
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index 35fa65a99836b..f084e1dfade15 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -75,25 +75,29 @@ TEST_CONSTEXPR(match_m128(_mm_blendv_ps((__m128)(__v4sf){0.0f, 1.0f, 2.0f, 3.0f}
__m128d test_mm_ceil_pd(__m128d x) {
// CHECK-LABEL: test_mm_ceil_pd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2)
+ // CHECK %{{.*}} = call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{.*}})
return _mm_ceil_pd(x);
}
__m128 test_mm_ceil_ps(__m128 x) {
// CHECK-LABEL: test_mm_ceil_ps
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2)
+ // CHECK: %{{.*}} = call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{.*}})
return _mm_ceil_ps(x);
}
__m128d test_mm_ceil_sd(__m128d x, __m128d y) {
// CHECK-LABEL: test_mm_ceil_sd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2)
+ // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call double @llvm.ceil.f64(double %[[A:.*]])
+ // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0
return _mm_ceil_sd(x, y);
}
__m128 test_mm_ceil_ss(__m128 x, __m128 y) {
// CHECK-LABEL: test_mm_ceil_ss
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2)
+ // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call float @llvm.ceil.f32(float %[[A:.*]])
+ // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0
return _mm_ceil_ss(x, y);
}
@@ -256,25 +260,29 @@ TEST_CONSTEXPR(_mm_extract_ps(((__m128){1.25f, 2.5f, 3.75f, 5.0f}), 6) == __buil
__m128d test_mm_floor_pd(__m128d x) {
// CHECK-LABEL: test_mm_floor_pd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = call <2 x double> @llvm.floor.v2f64(<2 x double> %{{.*}})
return _mm_floor_pd(x);
}
__m128 test_mm_floor_ps(__m128 x) {
// CHECK-LABEL: test_mm_floor_ps
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1)
+ // CHECK: %{{.*}} = call <4 x float> @llvm.floor.v4f32(<4 x float> %{{.*}})
return _mm_floor_ps(x);
}
__m128d test_mm_floor_sd(__m128d x, __m128d y) {
// CHECK-LABEL: test_mm_floor_sd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1)
+ // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call double @llvm.floor.f64(double %[[A:.*]])
+ // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0
return _mm_floor_sd(x, y);
}
__m128 test_mm_floor_ss(__m128 x, __m128 y) {
// CHECK-LABEL: test_mm_floor_ss
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1)
+ // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call float @llvm.floor.f32(float %[[A:.*]])
+ // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0
return _mm_floor_ss(x, y);
}
@@ -430,25 +438,29 @@ TEST_CONSTEXPR(match_v8hi(_mm_packus_epi32((__m128i)(__v4si){40000, -50000, 3276
__m128d test_mm_round_pd(__m128d x) {
// CHECK-LABEL: test_mm_round_pd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 4)
+ // CHECK: %{{.*}} = call <2 x double> @llvm.experimental.constrained.roundeven.v2f64(<2 x double> %{{.*}}, metadata !"fpexcept.strict")
return _mm_round_pd(x, 4);
}
__m128 test_mm_round_ps(__m128 x) {
// CHECK-LABEL: test_mm_round_ps
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 4)
+ // CHECK: %{{.*}} = call <4 x float> @llvm.experimental.constrained.roundeven.v4f32(<4 x float> %{{.*}}, metadata !"fpexcept.strict")
return _mm_round_ps(x, 4);
}
__m128d test_mm_round_sd(__m128d x, __m128d y) {
// CHECK-LABEL: test_mm_round_sd
- // CHECK: call {{.*}}<2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 4)
+ // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call double @llvm.experimental.constrained.roundeven.f64(double %[[A:.*]], metadata !"fpexcept.strict")
+ // CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0
return _mm_round_sd(x, y, 4);
}
__m128 test_mm_round_ss(__m128 x, __m128 y) {
// CHECK-LABEL: test_mm_round_ss
- // CHECK: call {{.*}}<4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 4)
+ // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
+ // CHECK: %[[B:.*]] = call float @llvm.experimental.constrained.roundeven.f32(float %[[A:.*]], metadata !"fpexcept.strict")
+ // CHECK: %{{.*}} = insertelement <4 x float> %0, float %[[B:.*]], i32 0
return _mm_round_ss(x, y, 4);
}
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index 1dd23f60c7e1e..7838e410badd7 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -626,18 +626,20 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// FP rounding ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
- def int_x86_sse41_round_ss : ClangBuiltin<"__builtin_ia32_roundss">,
- DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
- llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
- def int_x86_sse41_round_ps : ClangBuiltin<"__builtin_ia32_roundps">,
- DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty,
- llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
- def int_x86_sse41_round_sd : ClangBuiltin<"__builtin_ia32_roundsd">,
- DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
- llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<2>>]>;
- def int_x86_sse41_round_pd : ClangBuiltin<"__builtin_ia32_roundpd">,
- DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,
- llvm_i32_ty], [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ def int_x86_sse41_round_ss
+ : DefaultAttrsIntrinsic<[llvm_v4f32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+ def int_x86_sse41_round_ps
+ : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ def int_x86_sse41_round_sd
+ : DefaultAttrsIntrinsic<[llvm_v2f64_ty],
+ [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<2>>]>;
+ def int_x86_sse41_round_pd
+ : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
}
// Vector min element
@@ -921,12 +923,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_rcp_ps_256 : ClangBuiltin<"__builtin_ia32_rcpps256">,
DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
- def int_x86_avx_round_pd_256 : ClangBuiltin<"__builtin_ia32_roundpd256">,
- DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<1>>]>;
- def int_x86_avx_round_ps_256 : ClangBuiltin<"__builtin_ia32_roundps256">,
- DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty],
- [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ def int_x86_avx_round_pd_256
+ : DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+ def int_x86_avx_round_ps_256
+ : DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty],
+ [IntrNoMem, ImmArg<ArgIndex<1>>]>;
}
// Horizontal ops
cc: @RKSimon
if (updatePE) {
  switch (roundingMode) {
  case 0b00:
    ID = Intrinsic::experimental_constrained_roundeven;
Why do we have to use constrained intrinsics here?
`llvm.roundeven` does not raise floating-point exceptions. But I might have missed something?
We don't care about exceptions in the default FP mode; they are correctly handled only in strict FP mode.

You could use this syntax.
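For context, a minimal IR sketch of the two forms under discussion (signatures per the LangRef; the constrained roundeven variant takes only the exception-behavior metadata, while the plain intrinsic never raises FP exceptions):

```llvm
; default FP environment: the plain intrinsic, assumed not to raise exceptions
%r0 = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %x)

; strict FP environment: the constrained form carries the exception behavior as metadata
%r1 = call <4 x float> @llvm.experimental.constrained.roundeven.v4f32(
          <4 x float> %x, metadata !"fpexcept.strict")
```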
// CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
// CHECK: %[[B:.*]] = call double @llvm.ceil.f64(double %[[A:.*]])
// CHECK: %{{.*}} = insertelement <2 x double> %0, double %[[B:.*]], i32 0
I wonder if the backend combines them into the same instruction. Did you check whether we get the same assembly?
You're right, we do not get the same instruction. We get:

roundss xmm1, xmm1, 10
blendps xmm0, xmm1, 1

I can either try to implement a pattern to convert this to a single roundss/p, or lower directly to X86 here. Which one do you think is better?
I think pattern matching is worth a try. It can be done independently.
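For reference, the single instruction such a pattern match would aim to recover is presumably something along these lines (a sketch; the immediate 10 is _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC, matching the output quoted above):

```asm
roundss xmm0, xmm1, 10   ; round the low element of xmm1 and merge it into xmm0
```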
Commit message should be updated.
Adding clauses to `CodeGenFunction::EmitX86BuiltinExpr` to convert the SSE4.1/AVX1 builtins `roundps/pd/ss/sd` to regular rounding modes. I used:

- `roundeven`/`floor`/`ceil`/`trunc` when PE is not set and not using MXCSR
- `experimental_constrained_roundeven`/`floor`/`ceil`/`trunc` when setting PE and not using MXCSR
- `experimental_constrained_nearbyint` when setting MXCSR

Closes #170273
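As a concrete illustration of the lowering change, a sketch of the emitted IR for the `_mm256_ceil_pd` case, based on the updated avx-builtins.c checks:

```llvm
; before this patch: target-specific intrinsic carrying the rounding-control immediate
%r = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %x, i32 2)

; after this patch: the generic rounding intrinsic
%r = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
```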