[Clang][X86] Replace F16C vcvtph2ps/256 intrinsics with __builtin_convertvector

moorabbit · moorabbit · commit ea2135f42f78 · 2025-08-10T07:29:05.000-04:00
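The builtins `__builtin_ia32_vcvtph2ps` and `__builtin_ia32_vcvtph2ps256` were removed, and the F16C header intrinsics that called them are now implemented with generic vector builtins:

- `__builtin_ia32_vcvtph2ps` (used by `_cvtsh_ss` and `_mm_cvtph_ps`): replaced by a `__builtin_shufflevector` that selects the low four 16-bit elements, followed by a `__builtin_convertvector` from `__v4hf` to `__v4sf`.
- `__builtin_ia32_vcvtph2ps256` (used by `_mm256_cvtph_ps`): replaced by a single `__builtin_convertvector` from `__v8hf` to `__v8sf`.

Fixes #152749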
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
@@ -757,14 +757,6 @@ let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i
   def vcvtps2ph256 : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int)">;
 }
 
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
-  def vcvtph2ps : X86Builtin<"_Vector<4, float>(_Vector<8, short>)">;
-}
-
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
-  def vcvtph2ps256 : X86Builtin<"_Vector<8, float>(_Vector<8, short>)">;
-}
-
 let Features = "rdrnd", Attributes = [NoThrow] in {
   def rdrand16_step : X86Builtin<"unsigned int(unsigned short *)">;
   def rdrand32_step : X86Builtin<"unsigned int(unsigned int *)">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/X86.cpp b/clang/lib/CodeGen/TargetBuiltins/X86.cpp
@@ -2841,8 +2841,6 @@ Value *CodeGenFunction::EmitX86BuiltinExpr(unsigned BuiltinID,
     return getCmpIntrinsicCall(Intrinsic::x86_sse2_cmp_sd, 7);
 
   // f16c half2float intrinsics
-  case X86::BI__builtin_ia32_vcvtph2ps:
-  case X86::BI__builtin_ia32_vcvtph2ps256:
   case X86::BI__builtin_ia32_vcvtph2ps_mask:
   case X86::BI__builtin_ia32_vcvtph2ps256_mask:
   case X86::BI__builtin_ia32_vcvtph2ps512_mask: {
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
@@ -40,12 +40,17 @@ typedef signed char __v16qs __attribute__((__vector_size__(16)));
 
 #ifdef __SSE2__
 /* Both _Float16 and __bf16 require SSE2 being enabled. */
+typedef _Float16 __v4hf __attribute__((__vector_size__(8)));
 typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
 typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
 typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
 
 typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
 typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
+#else
+/* Use __fp16 when _Float16 is not supported. */
+typedef __fp16 __v4hf __attribute__((__vector_size__(8)));
+typedef __fp16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
 #endif
 
 /* Define the default attributes for the functions in this file. */
diff --git a/clang/lib/Headers/f16cintrin.h b/clang/lib/Headers/f16cintrin.h
@@ -39,7 +39,8 @@ static __inline float __DEFAULT_FN_ATTRS128
 _cvtsh_ss(unsigned short __a)
 {
   __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};
-  __v4sf __r = __builtin_ia32_vcvtph2ps(__v);
+  __v4hi __w = __builtin_shufflevector(__v, __v, 0, 1, 2, 3);
+  __v4sf __r = __builtin_convertvector((__v4hf)__w, __v4sf);
   return __r[0];
 }
 
@@ -109,7 +110,8 @@ _cvtsh_ss(unsigned short __a)
 static __inline __m128 __DEFAULT_FN_ATTRS128
 _mm_cvtph_ps(__m128i __a)
 {
-  return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);
+  __v4hi __v = __builtin_shufflevector((__v8hi)__a, (__v8hi)__a, 0, 1, 2, 3);
+  return __builtin_convertvector((__v4hf)__v, __v4sf);
 }
 
 /// Converts a 256-bit vector of [8 x float] into a 128-bit vector
@@ -153,7 +155,7 @@ _mm_cvtph_ps(__m128i __a)
 static __inline __m256 __DEFAULT_FN_ATTRS256
 _mm256_cvtph_ps(__m128i __a)
 {
-  return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);
+  return __builtin_convertvector((__v8hf)__a, __v8sf);
 }
 
 #undef __DEFAULT_FN_ATTRS128
diff --git a/clang/test/CodeGen/X86/f16c-builtins-constrained.c b/clang/test/CodeGen/X86/f16c-builtins-constrained.c
@@ -16,7 +16,7 @@ float test_cvtsh_ss(unsigned short a) {
   // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
   // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
   // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: call <4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict")
   // CHECK: extractelement <4 x float> %{{.*}}, i32 0
   return _cvtsh_ss(a);
@@ -38,7 +38,7 @@ unsigned short test_cvtss_sh(float a) {
 
 __m128 test_mm_cvtph_ps(__m128i a) {
   // CHECK-LABEL: test_mm_cvtph_ps
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: call {{.*}}<4 x float> @llvm.experimental.constrained.fpext.v4f32.v4f16(<4 x half> %{{.*}}, metadata !"fpexcept.strict")
   return _mm_cvtph_ps(a);
 }
diff --git a/clang/test/CodeGen/X86/f16c-builtins.c b/clang/test/CodeGen/X86/f16c-builtins.c
@@ -16,7 +16,7 @@ float test_cvtsh_ss(unsigned short a) {
   // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
   // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
   // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: fpext <4 x half> %{{.*}} to <4 x float>
   // CHECK: extractelement <4 x float> %{{.*}}, i32 0
   return _cvtsh_ss(a);
@@ -35,7 +35,7 @@ unsigned short test_cvtss_sh(float a) {
 
 __m128 test_mm_cvtph_ps(__m128i a) {
   // CHECK-LABEL: test_mm_cvtph_ps
-  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   // CHECK: fpext <4 x half> %{{.*}} to <4 x float>
   return _mm_cvtph_ps(a);
 }

Original file line number	Diff line number	Diff line change
`@@ -39,7 +39,8 @@ static __inline float __DEFAULT_FN_ATTRS128`
`39`	`39`	`_cvtsh_ss(unsigned short __a)`
`40`	`40`	`{`
`41`	`41`	`__v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0};`
`42`		`- __v4sf __r = __builtin_ia32_vcvtph2ps(__v);`
	`42`	`+ __v4hi __w = __builtin_shufflevector(__v, __v, 0, 1, 2, 3);`
	`43`	`+ __v4sf __r = __builtin_convertvector((__v4hf)__w, __v4sf);`
`43`	`44`	`return __r[0];`
`44`	`45`	`}`
`45`	`46`
`@@ -109,7 +110,8 @@ _cvtsh_ss(unsigned short __a)`
`109`	`110`	`static __inline __m128 __DEFAULT_FN_ATTRS128`
`110`	`111`	`_mm_cvtph_ps(__m128i __a)`
`111`	`112`	`{`
`112`		`- return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a);`
	`113`	`+ __v4hi __v = __builtin_shufflevector((__v8hi)__a, (__v8hi)__a, 0, 1, 2, 3);`
	`114`	`+ return __builtin_convertvector((__v4hf)__v, __v4sf);`
`113`	`115`	`}`
`114`	`116`
`115`	`117`	`/// Converts a 256-bit vector of [8 x float] into a 128-bit vector`
`@@ -153,7 +155,7 @@ _mm_cvtph_ps(__m128i __a)`
`153`	`155`	`static __inline __m256 __DEFAULT_FN_ATTRS256`
`154`	`156`	`_mm256_cvtph_ps(__m128i __a)`
`155`	`157`	`{`
`156`		`- return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a);`
	`158`	`+ return __builtin_convertvector((__v8hf)__a, __v8sf);`
`157`	`159`	`}`
`158`	`160`
`159`	`161`	`#undef __DEFAULT_FN_ATTRS128`