Skip to content

[Clang][X86] Replace F16C vcvtph2ps/256 intrinsics with (convert|shuffle)vector builtins #152911

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Aug 12, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -757,14 +757,6 @@ let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
def vcvtps2ph256 : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int)">;
}

let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
def vcvtph2ps : X86Builtin<"_Vector<4, float>(_Vector<8, short>)">;
}

let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
def vcvtph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, short>)">;
}

let Features = "rdrnd", Attributes = [NoThrow] in {
def rdrand16_step : X86Builtin<"unsigned int(unsigned short *)">;
def rdrand32_step : X86Builtin<"unsigned int(unsigned int *)">;
Expand Down
2 changes: 0 additions & 2 deletions clang/lib/CodeGen/TargetBuiltins/X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2841,8 +2841,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);

// f16c half2float intrinsics
case X86::BI__builtin_ia32_vcvtph2ps:
case X86::BI__builtin_ia32_vcvtph2ps256:
case X86::BI__builtin_ia32_vcvtph2ps_mask:
case X86::BI__builtin_ia32_vcvtph2ps256_mask:
case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
Expand Down
5 changes: 5 additions & 0 deletions clang/lib/Headers/emmintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,17 @@ typedef signed char __v16qs __attribute__((__vector_size__(16)));

#ifdef __SSE2__
/* Both _Float16 and __bf16 require SSE2 being enabled. */
typedef _Float16 __v4hf __attribute__((__vector_size__(8)));
typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));

typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
#else
/* Use __fp16 when _Float16 is not supported. */
typedef __fp16 __v4hf __attribute__((__vector_size__(8)));
typedef __fp16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand why this is necessary?

Copy link
Contributor Author

@moorabbit moorabbit Aug 10, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some tests were failing due to _Float16 not being supported on i686-*.
Maybe it's better to just add the -target-feature +sse2 flag in the failing tests to force support of _Float16?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We intend request SSE2 for _Float16 type.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would we be better off just defining the types inside the intrinsics?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That sounds better. Thx!

#endif

/* Define the default attributes for the functions in this file. */
Expand Down
8 changes: 5 additions & 3 deletions clang/lib/Headers/f16cintrin.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ static __inline float __DEFAULT_FN_ATTRS128
_cvtsh_ss(unsigned short __a)
{
__v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
__v4sf __r = __builtin_ia32_vcvtph2ps(__v);
__v4hi __w = __builtin_shufflevector(__v, __v, 0, 1, 2, 3);
__v4sf __r = __builtin_convertvector((__v4hf)__w, __v4sf);
return __r[0];
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this work consistently? I haven't properly compared the final asm at different -O levels.

float _cvtsh_ss(unsigned short __a)
{
  return (float)__builtin_bit_cast(_Float16, __a);
}

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thx, LGTM: https://godbolt.org/z/Pcr7aYeKE.
I wasn't aware of the builtin.

}

Expand Down Expand Up @@ -109,7 +110,8 @@ _cvtsh_ss(unsigned short __a)
static __inline __m128 __DEFAULT_FN_ATTRS128
_mm_cvtph_ps(__m128i __a)
{
return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
__v4hi __v = __builtin_shufflevector((__v8hi)__a, (__v8hi)__a, 0, 1, 2, 3);
return __builtin_convertvector((__v4hf)__v, __v4sf);
}

/// Converts a 256-bit vector of [8 x float] into a 128-bit vector
Expand Down Expand Up @@ -153,7 +155,7 @@ _mm_cvtph_ps(__m128i __a)
static __inline __m256 __DEFAULT_FN_ATTRS256
_mm256_cvtph_ps(__m128i __a)
{
return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
return __builtin_convertvector((__v8hf)__a, __v8sf);
}

#undef __DEFAULT_FN_ATTRS128
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGen/X86/f16c-builtins-constrained.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ float test_cvtsh_ss(unsigned short a) {
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict")
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _cvtsh_ss(a);
Expand All @@ -38,7 +38,7 @@ unsigned short test_cvtss_sh(float a) {

__m128 test_mm_cvtph_ps(__m128i a) {
// CHECK-LABEL: test_mm_cvtph_ps
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: call {{.*}}<4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict")
return _mm_cvtph_ps(a);
}
Expand Down
4 changes: 2 additions & 2 deletions clang/test/CodeGen/X86/f16c-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ float test_cvtsh_ss(unsigned short a) {
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
// CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: fpext <4 x half> %{{.*}} to <4 x float>
// CHECK: extractelement <4 x float> %{{.*}}, i32 0
return _cvtsh_ss(a);
Expand All @@ -35,7 +35,7 @@ unsigned short test_cvtss_sh(float a) {

__m128 test_mm_cvtph_ps(__m128i a) {
// CHECK-LABEL: test_mm_cvtph_ps
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
// CHECK: fpext <4 x half> %{{.*}} to <4 x float>
return _mm_cvtph_ps(a);
}
Expand Down