Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions clang/lib/CodeGen/TargetBuiltins/X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,52 @@ static Value *getMaskVecValue(CodeGenFunction &CGF, Value *Mask,
return MaskVec;
}

// Emit rounding for the value X according to the SSE4.1/AVX rounding
// immediate RoundingControl (the ROUNDP[SD]/ROUNDS[SD] imm8 encoding):
// bits [1:0] select the rounding mode; the remaining bits modify how the
// rounding is performed.
static Value *emitX86Round(CodeGenFunction &CGF, Value *X,
                           unsigned RoundingControl) {
  // imm8[1:0]: rounding-control field.
  unsigned RoundingMask = 0b11;
  // NOTE(review): per the Intel SDM, imm8[2] (_MM_FROUND_CUR_DIRECTION ==
  // 0x04) selects the MXCSR rounding mode, while imm8[3] suppresses
  // precision exceptions — confirm that testing 0b1000 here is intended.
  unsigned UseMXCSRBit = 0b1000;

  unsigned RoundingMode = RoundingControl & RoundingMask;
  bool UseMXCSR = RoundingControl & UseMXCSRBit;

  Intrinsic::ID ID = Intrinsic::not_intrinsic;
  LLVMContext &Ctx = CGF.CGM.getLLVMContext();

  if (UseMXCSR) {
    // Round using the dynamic (MXCSR) rounding mode; FP exceptions are
    // ignored.
    ID = Intrinsic::experimental_constrained_nearbyint;

    Value *ExceptMode =
        MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.ignore"));

    // The rounding-mode metadata string is "round.dynamic" (not
    // "rounding.dynamic"); anything else fails to parse as a RoundingMode.
    Value *DynRoundingMode =
        MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.dynamic"));

    Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
    // Constrained intrinsics take the rounding-mode metadata before the
    // exception-behavior metadata.
    return CGF.Builder.CreateCall(F, {X, DynRoundingMode, ExceptMode});
  }

  switch (RoundingMode) {
  case 0b00: // _MM_FROUND_TO_NEAREST_INT: round to nearest, ties to even.
    ID = Intrinsic::roundeven;
    break;
  case 0b01: // _MM_FROUND_TO_NEG_INF
    ID = Intrinsic::floor;
    break;
  case 0b10: // _MM_FROUND_TO_POS_INF
    ID = Intrinsic::ceil;
    break;
  case 0b11: // _MM_FROUND_TO_ZERO
    ID = Intrinsic::trunc;
    break;
  default:
    llvm_unreachable("Invalid rounding mode");
  }

  Function *F = CGF.CGM.getIntrinsic(ID, X->getType());
  return CGF.Builder.CreateCall(F, {X});
}

static Value *EmitX86MaskedStore(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
Align Alignment) {
Value *Ptr = Ops[0];
Expand Down Expand Up @@ -840,6 +886,23 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
Ops[0]);
return Builder.CreateExtractValue(Call, 0);
}
// Packed round builtins: the rounding-control immediate is Ops[1].
case X86::BI__builtin_ia32_roundps:
case X86::BI__builtin_ia32_roundpd:
case X86::BI__builtin_ia32_roundps256:
case X86::BI__builtin_ia32_roundpd256: {
// The immediate is required to be a constant, so the cast is safe.
unsigned M = cast<ConstantInt>(Ops[1])->getZExtValue();
return emitX86Round(*this, Ops[0], M);
}
// Scalar round builtins: round element 0 of Ops[1] and merge it into
// Ops[0]; insertelement preserves the remaining lanes of Ops[0].
case X86::BI__builtin_ia32_roundss:
case X86::BI__builtin_ia32_roundsd: {
// The rounding-control immediate is Ops[2] for the scalar forms.
unsigned M = cast<ConstantInt>(Ops[2])->getZExtValue();

Value *idx = Builder.getInt32(0);
Value *ValAt0 = Builder.CreateExtractElement(Ops[1], idx);
Value *RoundedAt0 = emitX86Round(*this, ValAt0, M);

return Builder.CreateInsertElement(Ops[0], RoundedAt0, idx);
}
case X86::BI__builtin_ia32_lzcnt_u16:
case X86::BI__builtin_ia32_lzcnt_u32:
case X86::BI__builtin_ia32_lzcnt_u64: {
Expand Down
12 changes: 6 additions & 6 deletions clang/test/CodeGen/X86/avx-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -246,13 +246,13 @@ TEST_CONSTEXPR(match_m128i(_mm256_castsi256_si128((__m256i)(__v4du){0xBFF0000000

__m256d test_mm256_ceil_pd(__m256d x) {
  // CHECK-LABEL: test_mm256_ceil_pd
  // CHECK: %{{.*}} = call <4 x double> @llvm.ceil.v4f64(<4 x double> %{{.*}})
  return _mm256_ceil_pd(x);
}

__m256 test_mm_ceil_ps(__m256 x) {
  // CHECK-LABEL: test_mm_ceil_ps
  // CHECK: %{{.*}} = call <8 x float> @llvm.ceil.v8f32(<8 x float> %{{.*}})
  return _mm256_ceil_ps(x);
}

Expand Down Expand Up @@ -1095,13 +1095,13 @@ TEST_CONSTEXPR(match_m128i(_mm256_extractf128_si256(((__m256i){0ULL, 1ULL, 2ULL,

__m256d test_mm256_floor_pd(__m256d x) {
  // CHECK-LABEL: test_mm256_floor_pd
  // CHECK: %{{.*}} = call <4 x double> @llvm.floor.v4f64(<4 x double> %{{.*}})
  return _mm256_floor_pd(x);
}

__m256 test_mm_floor_ps(__m256 x) {
  // CHECK-LABEL: test_mm_floor_ps
  // CHECK: %{{.*}} = call <8 x float> @llvm.floor.v8f32(<8 x float> %{{.*}})
  return _mm256_floor_ps(x);
}

Expand Down Expand Up @@ -1511,13 +1511,13 @@ __m256 test_mm256_rcp_ps(__m256 A) {

__m256d test_mm256_round_pd(__m256d x) {
  // CHECK-LABEL: test_mm256_round_pd
  // CHECK: %{{.*}} = call <4 x double> @llvm.roundeven.v4f64(<4 x double> %{{.*}})
  return _mm256_round_pd(x, 4);
}

__m256 test_mm256_round_ps(__m256 x) {
  // CHECK-LABEL: test_mm256_round_ps
  // CHECK: %{{.*}} = call <8 x float> @llvm.roundeven.v8f32(<8 x float> %{{.*}})
  return _mm256_round_ps(x, 4);
}

Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGen/X86/pr51324.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
// Make sure brackets work after macro intrinsics.
float pr51324(__m128 a) {
  // CHECK-LABEL: pr51324
  // Imm 0 selects round-to-nearest-even, so the generic roundeven
  // intrinsic is expected here. (The previous line was missing the
  // "CHECK:" prefix and therefore verified nothing.)
  // CHECK: call <4 x float> @llvm.roundeven.v4f32(<4 x float> %{{.*}})
  // CHECK: extractelement <4 x float> %{{.*}}, i32 0
  return _mm_round_ps(a, 0)[0];
}
36 changes: 24 additions & 12 deletions clang/test/CodeGen/X86/sse41-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,25 +75,29 @@ TEST_CONSTEXPR(match_m128(_mm_blendv_ps((__m128)(__v4sf){0.0f, 1.0f, 2.0f, 3.0f}

__m128d test_mm_ceil_pd(__m128d x) {
  // CHECK-LABEL: test_mm_ceil_pd
  // CHECK: %{{.*}} = call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{.*}})
  return _mm_ceil_pd(x);
}

__m128 test_mm_ceil_ps(__m128 x) {
  // CHECK-LABEL: test_mm_ceil_ps
  // CHECK: %{{.*}} = call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{.*}})
  return _mm_ceil_ps(x);
}

__m128d test_mm_ceil_sd(__m128d x, __m128d y) {
  // CHECK-LABEL: test_mm_ceil_sd
  // Scalar form: ceil lane 0 of y, merge into x. Subsequent uses of a
  // FileCheck variable must be [[A]] / [[B]] — [[A:.*]] would redefine it.
  // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
  // CHECK: %[[B:.*]] = call double @llvm.ceil.f64(double %[[A]])
  // CHECK: %{{.*}} = insertelement <2 x double> %{{.*}}, double %[[B]], i32 0
  return _mm_ceil_sd(x, y);
}

__m128 test_mm_ceil_ss(__m128 x, __m128 y) {
  // CHECK-LABEL: test_mm_ceil_ss
  // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
  // CHECK: %[[B:.*]] = call float @llvm.ceil.f32(float %[[A]])
  // CHECK: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[B]], i32 0
  return _mm_ceil_ss(x, y);
}

Expand Down Expand Up @@ -256,25 +260,29 @@ TEST_CONSTEXPR(_mm_extract_ps(((__m128){1.25f, 2.5f, 3.75f, 5.0f}), 6) == __buil

__m128d test_mm_floor_pd(__m128d x) {
  // CHECK-LABEL: test_mm_floor_pd
  // CHECK: %{{.*}} = call <2 x double> @llvm.floor.v2f64(<2 x double> %{{.*}})
  return _mm_floor_pd(x);
}

__m128 test_mm_floor_ps(__m128 x) {
  // CHECK-LABEL: test_mm_floor_ps
  // CHECK: %{{.*}} = call <4 x float> @llvm.floor.v4f32(<4 x float> %{{.*}})
  return _mm_floor_ps(x);
}

__m128d test_mm_floor_sd(__m128d x, __m128d y) {
  // CHECK-LABEL: test_mm_floor_sd
  // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
  // CHECK: %[[B:.*]] = call double @llvm.floor.f64(double %[[A]])
  // CHECK: %{{.*}} = insertelement <2 x double> %{{.*}}, double %[[B]], i32 0
  return _mm_floor_sd(x, y);
}

__m128 test_mm_floor_ss(__m128 x, __m128 y) {
  // CHECK-LABEL: test_mm_floor_ss
  // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
  // CHECK: %[[B:.*]] = call float @llvm.floor.f32(float %[[A]])
  // CHECK: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[B]], i32 0
  return _mm_floor_ss(x, y);
}

Expand Down Expand Up @@ -430,25 +438,29 @@ TEST_CONSTEXPR(match_v8hi(_mm_packus_epi32((__m128i)(__v4si){40000, -50000, 3276

__m128d test_mm_round_pd(__m128d x) {
  // CHECK-LABEL: test_mm_round_pd
  // CHECK: %{{.*}} = call <2 x double> @llvm.roundeven.v2f64(<2 x double> %{{.*}})
  return _mm_round_pd(x, 4);
}

__m128 test_mm_round_ps(__m128 x) {
  // CHECK-LABEL: test_mm_round_ps
  // CHECK: %{{.*}} = call <4 x float> @llvm.roundeven.v4f32(<4 x float> %{{.*}})
  return _mm_round_ps(x, 4);
}

__m128d test_mm_round_sd(__m128d x, __m128d y) {
  // CHECK-LABEL: test_mm_round_sd
  // CHECK: %[[A:.*]] = extractelement <2 x double> %{{.*}}, i32 0
  // CHECK: %[[B:.*]] = call double @llvm.roundeven.f64(double %[[A]])
  // CHECK: %{{.*}} = insertelement <2 x double> %{{.*}}, double %[[B]], i32 0
  return _mm_round_sd(x, y, 4);
}

__m128 test_mm_round_ss(__m128 x, __m128 y) {
  // CHECK-LABEL: test_mm_round_ss
  // CHECK: %[[A:.*]] = extractelement <4 x float> %{{.*}}, i32 0
  // CHECK: %[[B:.*]] = call float @llvm.roundeven.f32(float %[[A]])
  // CHECK: %{{.*}} = insertelement <4 x float> %{{.*}}, float %[[B]], i32 0
  return _mm_round_ss(x, y, 4);
}

Expand Down
38 changes: 20 additions & 18 deletions llvm/include/llvm/IR/IntrinsicsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -626,18 +626,20 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".

// FP rounding ops
let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
// The ClangBuiltin bindings were intentionally dropped: the round builtins
// are now lowered in CGBuiltin to generic rounding intrinsics instead of
// being auto-mapped to these target intrinsics.
def int_x86_sse41_round_ss
    : DefaultAttrsIntrinsic<[llvm_v4f32_ty],
                            [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty],
                            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
def int_x86_sse41_round_ps
    : DefaultAttrsIntrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty],
                            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
def int_x86_sse41_round_sd
    : DefaultAttrsIntrinsic<[llvm_v2f64_ty],
                            [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty],
                            [IntrNoMem, ImmArg<ArgIndex<2>>]>;
def int_x86_sse41_round_pd
    : DefaultAttrsIntrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty],
                            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
}

// Vector min element
Expand Down Expand Up @@ -921,12 +923,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_avx_rcp_ps_256 : ClangBuiltin<"__builtin_ia32_rcpps256">,
DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;

// ClangBuiltin bindings dropped: the 256-bit round builtins are lowered in
// CGBuiltin to generic rounding intrinsics.
def int_x86_avx_round_pd_256
    : DefaultAttrsIntrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty],
                            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
def int_x86_avx_round_ps_256
    : DefaultAttrsIntrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty],
                            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
}

// Horizontal ops
Expand Down