Skip to content

Conversation

@ericxu233
Copy link
Contributor

@ericxu233 ericxu233 commented Oct 7, 2025

Fixes #160312

@ericxu233 ericxu233 force-pushed the constexpr-vcvtps2ph branch from 6c0574c to 5417390 Compare October 7, 2025 14:50
@llvmbot llvmbot added clang Clang issues not falling into any other category backend:X86 clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:bytecode Issues for the clang bytecode constexpr interpreter labels Oct 7, 2025
@llvmbot
Copy link
Member

llvmbot commented Oct 7, 2025

@llvm/pr-subscribers-backend-x86

Author: Hanyang (Eric) Xu (ericxu233)

Changes

Addresses #160312


Full diff: https://github.com/llvm/llvm-project/pull/162295.diff

4 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsX86.td (+4-2)
  • (modified) clang/lib/AST/ByteCode/InterpBuiltin.cpp (+89)
  • (modified) clang/lib/AST/ExprConstant.cpp (+75)
  • (modified) clang/test/CodeGen/X86/f16c-builtins.c (+57)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 41652259cf6a3..f4688901168f4 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -697,11 +697,13 @@ let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
   def gatherq_d : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int const *, _Vector<2, long long int>, _Vector<4, int>, _Constant char)">;
 }
 
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "f16c",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vcvtps2ph : X86Builtin<"_Vector<8, short>(_Vector<4, float>, _Constant int)">;
 }
 
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "f16c",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vcvtps2ph256 : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int)">;
 }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 1eea813b8c556..613dcadaf13b0 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3002,6 +3002,91 @@ static bool interp__builtin_vec_set(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+static bool interp__builtin_ia32_vcvtps2ph(InterpState &S, CodePtr OpPC,
+                                           const CallExpr *Call) {
+  // Arguments are: vector of floats, rounding immediate
+  assert(Call->getNumArgs() == 2);
+
+  APSInt Imm = popToAPSInt(S, Call->getArg(1));
+  const Pointer &Src = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  assert(Src.getFieldDesc()->isPrimitiveArray());
+  assert(Dst.getFieldDesc()->isPrimitiveArray());
+
+  const auto *SrcVTy = Call->getArg(0)->getType()->castAs<VectorType>();
+  unsigned SrcNumElems = SrcVTy->getNumElements();
+  const auto *DstVTy = Call->getType()->castAs<VectorType>();
+  unsigned DstNumElems = DstVTy->getNumElements();
+
+  const llvm::fltSemantics &HalfSem =
+      S.getASTContext().getFloatTypeSemantics(S.getASTContext().HalfTy);
+
+  // imm[2] == 1 means use MXCSR rounding mode.
+  // In that case, we can only evaluate if the conversion is exact.
+  int ImmVal = Imm.getZExtValue();
+  bool UseMXCSR = (ImmVal & 4) != 0;
+
+  llvm::RoundingMode RM;
+  if (!UseMXCSR) {
+    switch (ImmVal & 3) {
+    case 0:
+      RM = llvm::RoundingMode::NearestTiesToEven;
+      break;
+    case 1:
+      RM = llvm::RoundingMode::TowardNegative;
+      break;
+    case 2:
+      RM = llvm::RoundingMode::TowardPositive;
+      break;
+    case 3:
+      RM = llvm::RoundingMode::TowardZero;
+      break;
+    default:
+      llvm_unreachable("Invalid immediate rounding mode");
+    }
+  } else {
+    // For MXCSR, we must check for exactness. We can use any rounding mode
+    // for the trial conversion since the result is the same if it's exact.
+    RM = llvm::RoundingMode::NearestTiesToEven;
+  }
+
+  QualType DstElemQT = Dst.getFieldDesc()->getElemQualType();
+  PrimType DstElemT = *S.getContext().classify(DstElemQT);
+  bool DstIsUnsigned = DstElemQT->isUnsignedIntegerOrEnumerationType();
+
+  for (unsigned I = 0; I < SrcNumElems; ++I) {
+    Floating SrcVal = Src.elem<Floating>(I);
+    APFloat DstVal = SrcVal.getAPFloat();
+
+    bool LostInfo;
+    APFloat::opStatus St = DstVal.convert(HalfSem, RM, &LostInfo);
+
+    if (UseMXCSR && St != APFloat::opOK) {
+      S.FFDiag(S.Current->getSource(OpPC),
+               diag::note_constexpr_dynamic_rounding);
+      return false;
+    }
+
+    INT_TYPE_SWITCH_NO_BOOL(DstElemT, {
+      // FIX: Extract the integer value before calling 'from'.
+      uint64_t RawBits = DstVal.bitcastToAPInt().getZExtValue();
+      Dst.elem<T>(I) = T::from(RawBits);
+    });
+  }
+
+  // Zero out remaining elements if the destination has more elements
+  // (e.g., vcvtps2ph converting 4 floats to 8 shorts).
+  if (DstNumElems > SrcNumElems) {
+    for (unsigned I = SrcNumElems; I < DstNumElems; ++I) {
+      INT_TYPE_SWITCH_NO_BOOL(DstElemT, { Dst.elem<T>(I) = T::from(0); });
+    }
+  }
+
+  Dst.initializeAllElements();
+  return true;
+}
+
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
                       uint32_t BuiltinID) {
   if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
@@ -3845,6 +3930,10 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case X86::BI__builtin_ia32_insert128i256:
     return interp__builtin_x86_insert_subvector(S, OpPC, Call, BuiltinID);
 
+  case clang::X86::BI__builtin_ia32_vcvtps2ph:
+  case clang::X86::BI__builtin_ia32_vcvtps2ph256:
+    return interp__builtin_ia32_vcvtps2ph(S, OpPC, Call);
+
   case X86::BI__builtin_ia32_vec_ext_v4hi:
   case X86::BI__builtin_ia32_vec_ext_v16qi:
   case X86::BI__builtin_ia32_vec_ext_v8hi:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 618e1636e9e53..0f709688b87c5 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12442,6 +12442,81 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
 
     return Success(APValue(Elems.data(), NumElems), E);
   }
+
+  case clang::X86::BI__builtin_ia32_vcvtps2ph:
+  case clang::X86::BI__builtin_ia32_vcvtps2ph256: {
+    APValue SrcVec;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SrcVec))
+      return false;
+
+    APSInt Imm;
+    if (!EvaluateInteger(E->getArg(1), Imm, Info))
+      return false;
+
+    assert(SrcVec.isVector());
+
+    const auto *SrcVTy = E->getArg(0)->getType()->castAs<VectorType>();
+    unsigned SrcNumElems = SrcVTy->getNumElements();
+    const auto *DstVTy = E->getType()->castAs<VectorType>();
+    unsigned DstNumElems = DstVTy->getNumElements();
+    QualType DstElemTy = DstVTy->getElementType();
+
+    const llvm::fltSemantics &HalfSem =
+        Info.Ctx.getFloatTypeSemantics(Info.Ctx.HalfTy);
+
+    int ImmVal = Imm.getZExtValue();
+    bool UseMXCSR = (ImmVal & 4) != 0;
+
+    llvm::RoundingMode RM;
+    if (!UseMXCSR) {
+      switch (ImmVal & 3) {
+      case 0:
+        RM = llvm::RoundingMode::NearestTiesToEven;
+        break;
+      case 1:
+        RM = llvm::RoundingMode::TowardNegative;
+        break;
+      case 2:
+        RM = llvm::RoundingMode::TowardPositive;
+        break;
+      case 3:
+        RM = llvm::RoundingMode::TowardZero;
+        break;
+      default:
+        llvm_unreachable("Invalid immediate rounding mode");
+      }
+    } else {
+      RM = llvm::RoundingMode::NearestTiesToEven;
+    }
+
+    SmallVector<APValue, 8> ResultElements;
+    ResultElements.reserve(DstNumElems);
+
+    for (unsigned I = 0; I < SrcNumElems; ++I) {
+      APFloat SrcVal = SrcVec.getVectorElt(I).getFloat();
+
+      bool LostInfo;
+      APFloat::opStatus St = SrcVal.convert(HalfSem, RM, &LostInfo);
+
+      if (UseMXCSR && St != APFloat::opOK) {
+        Info.FFDiag(E, diag::note_constexpr_dynamic_rounding);
+        return false;
+      }
+
+      APSInt DstInt(SrcVal.bitcastToAPInt(),
+                    DstElemTy->isUnsignedIntegerOrEnumerationType());
+      ResultElements.push_back(APValue(DstInt));
+    }
+
+    if (DstNumElems > SrcNumElems) {
+      APSInt Zero = Info.Ctx.MakeIntValue(0, DstElemTy);
+      for (unsigned I = SrcNumElems; I < DstNumElems; ++I) {
+        ResultElements.push_back(APValue(Zero));
+      }
+    }
+
+    return Success(ResultElements, E);
+  }
   }
 }
 
diff --git a/clang/test/CodeGen/X86/f16c-builtins.c b/clang/test/CodeGen/X86/f16c-builtins.c
index c08ef76d56981..de35c16c75ab4 100755
--- a/clang/test/CodeGen/X86/f16c-builtins.c
+++ b/clang/test/CodeGen/X86/f16c-builtins.c
@@ -67,3 +67,60 @@ __m128i test_mm256_cvtps_ph(__m256 a) {
   // CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %{{.*}}, i32 0)
   return _mm256_cvtps_ph(a, 0);
 }
+
+// A value exactly halfway between 1.0 and the next representable FP16 number.
+// In binary, its significand ends in ...000, followed by a tie-bit 1.
+#define POS_HALFWAY (1.0f + 0.00048828125f) // 1.0 + 2^-11, a tie-breaking case
+
+//
+// __builtin_ia32_vcvtps2ph (128-bit, 4 floats -> 8 shorts, 4 are zero-padded)
+//
+// Test values: -2.5f, 1.123f, POS_HALFWAY
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEAREST_INT),
+  0xC100, 0x3C7E, 0x3C00, 0x0000, 0, 0, 0, 0
+));
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEG_INF),
+  0xC100, 0x3C7D, 0x3C00, 0x0000, 0, 0, 0, 0
+));
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_POS_INF),
+  0xC100, 0x3C7E, 0x3C01, 0x0000, 0, 0, 0, 0
+));
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_ZERO),
+  0xC100, 0x3C7D, 0x3C00, 0x0000, 0, 0, 0, 0
+));
+
+//
+// __builtin_ia32_vcvtps2ph256 (256-bit, 8 floats -> 8 shorts)
+//
+// Test values: -2.5f, 1.123f, POS_HALFWAY
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph256(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEAREST_INT),
+  0xC100, 0x3C7E, 0x3C00, 0x0000, 0xC100, 0x3C7E, 0x3C00, 0x0000
+));
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph256(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEG_INF),
+  0xC100, 0x3C7D, 0x3C00, 0x0000, 0xC100, 0x3C7D, 0x3C00, 0x0000
+));
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph256(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_POS_INF),
+  0xC100, 0x3C7E, 0x3C01, 0x0000, 0xC100, 0x3C7E, 0x3C01, 0x0000
+));
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph256(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_ZERO),
+  0xC100, 0x3C7D, 0x3C00, 0x0000, 0xC100, 0x3C7D, 0x3C00, 0x0000
+));
+
+//
+// Tests for Exact Dynamic Rounding
+//
+// Test that dynamic rounding SUCCEEDS for exactly representable values.
+// We use _MM_FROUND_CUR_DIRECTION (value 4) to specify dynamic rounding.
+// Inputs: -2.5f, 0.125f, -16.0f are all exactly representable in FP16.
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph256(_mm256_setr_ps(-2.5f, 0.125f, -16.0f, 0.0f, -2.5f, 0.125f, -16.0f, 0.0f), _MM_FROUND_CUR_DIRECTION),
+  0xC100, 0x3000, 0xCC00, 0x0000, 0xC100, 0x3000, 0xCC00, 0x0000
+));
\ No newline at end of file

@llvmbot
Copy link
Member

llvmbot commented Oct 7, 2025

@llvm/pr-subscribers-clang

Author: Hanyang (Eric) Xu (ericxu233)

Changes

Addresses #160312


Full diff: https://github.com/llvm/llvm-project/pull/162295.diff

4 Files Affected:

  • (modified) clang/include/clang/Basic/BuiltinsX86.td (+4-2)
  • (modified) clang/lib/AST/ByteCode/InterpBuiltin.cpp (+89)
  • (modified) clang/lib/AST/ExprConstant.cpp (+75)
  • (modified) clang/test/CodeGen/X86/f16c-builtins.c (+57)
diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td
index 41652259cf6a3..f4688901168f4 100644
--- a/clang/include/clang/Basic/BuiltinsX86.td
+++ b/clang/include/clang/Basic/BuiltinsX86.td
@@ -697,11 +697,13 @@ let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
   def gatherq_d : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int const *, _Vector<2, long long int>, _Vector<4, int>, _Constant char)">;
 }
 
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
+let Features = "f16c",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
   def vcvtps2ph : X86Builtin<"_Vector<8, short>(_Vector<4, float>, _Constant int)">;
 }
 
-let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
+let Features = "f16c",
+    Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
   def vcvtps2ph256 : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int)">;
 }
 
diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
index 1eea813b8c556..613dcadaf13b0 100644
--- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp
+++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp
@@ -3002,6 +3002,91 @@ static bool interp__builtin_vec_set(InterpState &S, CodePtr OpPC,
   return true;
 }
 
+static bool interp__builtin_ia32_vcvtps2ph(InterpState &S, CodePtr OpPC,
+                                           const CallExpr *Call) {
+  // Arguments are: vector of floats, rounding immediate
+  assert(Call->getNumArgs() == 2);
+
+  APSInt Imm = popToAPSInt(S, Call->getArg(1));
+  const Pointer &Src = S.Stk.pop<Pointer>();
+  const Pointer &Dst = S.Stk.peek<Pointer>();
+
+  assert(Src.getFieldDesc()->isPrimitiveArray());
+  assert(Dst.getFieldDesc()->isPrimitiveArray());
+
+  const auto *SrcVTy = Call->getArg(0)->getType()->castAs<VectorType>();
+  unsigned SrcNumElems = SrcVTy->getNumElements();
+  const auto *DstVTy = Call->getType()->castAs<VectorType>();
+  unsigned DstNumElems = DstVTy->getNumElements();
+
+  const llvm::fltSemantics &HalfSem =
+      S.getASTContext().getFloatTypeSemantics(S.getASTContext().HalfTy);
+
+  // imm[2] == 1 means use MXCSR rounding mode.
+  // In that case, we can only evaluate if the conversion is exact.
+  int ImmVal = Imm.getZExtValue();
+  bool UseMXCSR = (ImmVal & 4) != 0;
+
+  llvm::RoundingMode RM;
+  if (!UseMXCSR) {
+    switch (ImmVal & 3) {
+    case 0:
+      RM = llvm::RoundingMode::NearestTiesToEven;
+      break;
+    case 1:
+      RM = llvm::RoundingMode::TowardNegative;
+      break;
+    case 2:
+      RM = llvm::RoundingMode::TowardPositive;
+      break;
+    case 3:
+      RM = llvm::RoundingMode::TowardZero;
+      break;
+    default:
+      llvm_unreachable("Invalid immediate rounding mode");
+    }
+  } else {
+    // For MXCSR, we must check for exactness. We can use any rounding mode
+    // for the trial conversion since the result is the same if it's exact.
+    RM = llvm::RoundingMode::NearestTiesToEven;
+  }
+
+  QualType DstElemQT = Dst.getFieldDesc()->getElemQualType();
+  PrimType DstElemT = *S.getContext().classify(DstElemQT);
+  bool DstIsUnsigned = DstElemQT->isUnsignedIntegerOrEnumerationType();
+
+  for (unsigned I = 0; I < SrcNumElems; ++I) {
+    Floating SrcVal = Src.elem<Floating>(I);
+    APFloat DstVal = SrcVal.getAPFloat();
+
+    bool LostInfo;
+    APFloat::opStatus St = DstVal.convert(HalfSem, RM, &LostInfo);
+
+    if (UseMXCSR && St != APFloat::opOK) {
+      S.FFDiag(S.Current->getSource(OpPC),
+               diag::note_constexpr_dynamic_rounding);
+      return false;
+    }
+
+    INT_TYPE_SWITCH_NO_BOOL(DstElemT, {
+      // FIX: Extract the integer value before calling 'from'.
+      uint64_t RawBits = DstVal.bitcastToAPInt().getZExtValue();
+      Dst.elem<T>(I) = T::from(RawBits);
+    });
+  }
+
+  // Zero out remaining elements if the destination has more elements
+  // (e.g., vcvtps2ph converting 4 floats to 8 shorts).
+  if (DstNumElems > SrcNumElems) {
+    for (unsigned I = SrcNumElems; I < DstNumElems; ++I) {
+      INT_TYPE_SWITCH_NO_BOOL(DstElemT, { Dst.elem<T>(I) = T::from(0); });
+    }
+  }
+
+  Dst.initializeAllElements();
+  return true;
+}
+
 bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
                       uint32_t BuiltinID) {
   if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
@@ -3845,6 +3930,10 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
   case X86::BI__builtin_ia32_insert128i256:
     return interp__builtin_x86_insert_subvector(S, OpPC, Call, BuiltinID);
 
+  case clang::X86::BI__builtin_ia32_vcvtps2ph:
+  case clang::X86::BI__builtin_ia32_vcvtps2ph256:
+    return interp__builtin_ia32_vcvtps2ph(S, OpPC, Call);
+
   case X86::BI__builtin_ia32_vec_ext_v4hi:
   case X86::BI__builtin_ia32_vec_ext_v16qi:
   case X86::BI__builtin_ia32_vec_ext_v8hi:
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 618e1636e9e53..0f709688b87c5 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -12442,6 +12442,81 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
 
     return Success(APValue(Elems.data(), NumElems), E);
   }
+
+  case clang::X86::BI__builtin_ia32_vcvtps2ph:
+  case clang::X86::BI__builtin_ia32_vcvtps2ph256: {
+    APValue SrcVec;
+    if (!EvaluateAsRValue(Info, E->getArg(0), SrcVec))
+      return false;
+
+    APSInt Imm;
+    if (!EvaluateInteger(E->getArg(1), Imm, Info))
+      return false;
+
+    assert(SrcVec.isVector());
+
+    const auto *SrcVTy = E->getArg(0)->getType()->castAs<VectorType>();
+    unsigned SrcNumElems = SrcVTy->getNumElements();
+    const auto *DstVTy = E->getType()->castAs<VectorType>();
+    unsigned DstNumElems = DstVTy->getNumElements();
+    QualType DstElemTy = DstVTy->getElementType();
+
+    const llvm::fltSemantics &HalfSem =
+        Info.Ctx.getFloatTypeSemantics(Info.Ctx.HalfTy);
+
+    int ImmVal = Imm.getZExtValue();
+    bool UseMXCSR = (ImmVal & 4) != 0;
+
+    llvm::RoundingMode RM;
+    if (!UseMXCSR) {
+      switch (ImmVal & 3) {
+      case 0:
+        RM = llvm::RoundingMode::NearestTiesToEven;
+        break;
+      case 1:
+        RM = llvm::RoundingMode::TowardNegative;
+        break;
+      case 2:
+        RM = llvm::RoundingMode::TowardPositive;
+        break;
+      case 3:
+        RM = llvm::RoundingMode::TowardZero;
+        break;
+      default:
+        llvm_unreachable("Invalid immediate rounding mode");
+      }
+    } else {
+      RM = llvm::RoundingMode::NearestTiesToEven;
+    }
+
+    SmallVector<APValue, 8> ResultElements;
+    ResultElements.reserve(DstNumElems);
+
+    for (unsigned I = 0; I < SrcNumElems; ++I) {
+      APFloat SrcVal = SrcVec.getVectorElt(I).getFloat();
+
+      bool LostInfo;
+      APFloat::opStatus St = SrcVal.convert(HalfSem, RM, &LostInfo);
+
+      if (UseMXCSR && St != APFloat::opOK) {
+        Info.FFDiag(E, diag::note_constexpr_dynamic_rounding);
+        return false;
+      }
+
+      APSInt DstInt(SrcVal.bitcastToAPInt(),
+                    DstElemTy->isUnsignedIntegerOrEnumerationType());
+      ResultElements.push_back(APValue(DstInt));
+    }
+
+    if (DstNumElems > SrcNumElems) {
+      APSInt Zero = Info.Ctx.MakeIntValue(0, DstElemTy);
+      for (unsigned I = SrcNumElems; I < DstNumElems; ++I) {
+        ResultElements.push_back(APValue(Zero));
+      }
+    }
+
+    return Success(ResultElements, E);
+  }
   }
 }
 
diff --git a/clang/test/CodeGen/X86/f16c-builtins.c b/clang/test/CodeGen/X86/f16c-builtins.c
index c08ef76d56981..de35c16c75ab4 100755
--- a/clang/test/CodeGen/X86/f16c-builtins.c
+++ b/clang/test/CodeGen/X86/f16c-builtins.c
@@ -67,3 +67,60 @@ __m128i test_mm256_cvtps_ph(__m256 a) {
   // CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %{{.*}}, i32 0)
   return _mm256_cvtps_ph(a, 0);
 }
+
+// A value exactly halfway between 1.0 and the next representable FP16 number.
+// In binary, its significand ends in ...000, followed by a tie-bit 1.
+#define POS_HALFWAY (1.0f + 0.00048828125f) // 1.0 + 2^-11, a tie-breaking case
+
+//
+// __builtin_ia32_vcvtps2ph (128-bit, 4 floats -> 8 shorts, 4 are zero-padded)
+//
+// Test values: -2.5f, 1.123f, POS_HALFWAY
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEAREST_INT),
+  0xC100, 0x3C7E, 0x3C00, 0x0000, 0, 0, 0, 0
+));
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEG_INF),
+  0xC100, 0x3C7D, 0x3C00, 0x0000, 0, 0, 0, 0
+));
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_POS_INF),
+  0xC100, 0x3C7E, 0x3C01, 0x0000, 0, 0, 0, 0
+));
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_ZERO),
+  0xC100, 0x3C7D, 0x3C00, 0x0000, 0, 0, 0, 0
+));
+
+//
+// __builtin_ia32_vcvtps2ph256 (256-bit, 8 floats -> 8 shorts)
+//
+// Test values: -2.5f, 1.123f, POS_HALFWAY
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph256(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEAREST_INT),
+  0xC100, 0x3C7E, 0x3C00, 0x0000, 0xC100, 0x3C7E, 0x3C00, 0x0000
+));
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph256(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEG_INF),
+  0xC100, 0x3C7D, 0x3C00, 0x0000, 0xC100, 0x3C7D, 0x3C00, 0x0000
+));
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph256(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_POS_INF),
+  0xC100, 0x3C7E, 0x3C01, 0x0000, 0xC100, 0x3C7E, 0x3C01, 0x0000
+));
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph256(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_ZERO),
+  0xC100, 0x3C7D, 0x3C00, 0x0000, 0xC100, 0x3C7D, 0x3C00, 0x0000
+));
+
+//
+// Tests for Exact Dynamic Rounding
+//
+// Test that dynamic rounding SUCCEEDS for exactly representable values.
+// We use _MM_FROUND_CUR_DIRECTION (value 4) to specify dynamic rounding.
+// Inputs: -2.5f, 0.125f, -16.0f are all exactly representable in FP16.
+TEST_CONSTEXPR(match_v8hi(
+  __builtin_ia32_vcvtps2ph256(_mm256_setr_ps(-2.5f, 0.125f, -16.0f, 0.0f, -2.5f, 0.125f, -16.0f, 0.0f), _MM_FROUND_CUR_DIRECTION),
+  0xC100, 0x3000, 0xCC00, 0x0000, 0xC100, 0x3000, 0xCC00, 0x0000
+));
\ No newline at end of file

@RKSimon RKSimon self-requested a review October 7, 2025 14:53
Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you think you can handle the avx512 vcvtps2ph*_mask builtins as well? It should mainly just be a bit of arg count twiddling

@ericxu233
Copy link
Contributor Author

@RKSimon

Do you think you can handle the avx512 vcvtps2ph*_mask builtins as well? It should main

Do I do all vcvtps2ph instructions? Some instructions are from AVX512_FP16 which have native support for half-precision floats. In order to support these instructions, I need to write new logic since the current logic only handles F16C-style half-precision floats which are short integers.

Maybe I can do the rest of the instructions, excluding the AVX512_FP16 instructions. And we can do the AVX512_FP16 instructions in the future?

@RKSimon
Copy link
Collaborator

RKSimon commented Oct 26, 2025

I just meant the AVX512F/VL variants vcvtps2ph_mask / vcvtps2ph256_mask / __builtin_ia32_vcvtps2ph512_mask - those should just require additional handling for the getNumArgs() == 4 masked variant.

But I'm happy for this just to focus on the F16C cases if you prefer to iterate in later PRs.

@ericxu233 ericxu233 force-pushed the constexpr-vcvtps2ph branch from bad6e80 to 1141c40 Compare November 6, 2025 06:43
@ericxu233
Copy link
Contributor Author

I just meant the AVX512F/VL variants vcvtps2ph_mask / vcvtps2ph256_mask / __builtin_ia32_vcvtps2ph512_mask - those should just require additional handling for the getNumArgs() == 4 masked variant.

But I'm happy for this just to focus on the F16C cases if you prefer to iterate in later PRs.

I apologize for the late response, and thanks for the flexibility! Yes, I would prefer to iterate in later PRs since I am a little bit pressed for time lately. Let me know if there are any additional items in the code that I need to address.

@RKSimon RKSimon self-requested a review November 6, 2025 16:45
@ericxu233
Copy link
Contributor Author

@RKSimon Ping for status. I've fixed this last review comment. Would this be ready to merge :)?

@RKSimon RKSimon self-requested a review November 10, 2025 19:24
@RKSimon
Copy link
Collaborator

RKSimon commented Nov 10, 2025

Sorry I'll take a look soon - until you request a review I tend to ignore PRs after I request changes

@RKSimon RKSimon requested a review from phoebewang November 11, 2025 11:25
@RKSimon
Copy link
Collaborator

RKSimon commented Nov 11, 2025

@phoebewang Are you OK with this? An alternative is we just allow exact rounding (which is what we'll end up doing for the SSE2 style conversions like CVTPD2DQ).

@phoebewang
Copy link
Contributor

@phoebewang Are you OK with this? An alternative is we just allow exact rounding (which is what we'll end up doing for the SSE2 style conversions like CVTPD2DQ).

Can we check for strictfp mode? I think we can assume NearestTiesToEven in non strictfp mode.

@RKSimon
Copy link
Collaborator

RKSimon commented Nov 17, 2025

@phoebewang Are you OK with this? An alternative is we just allow exact rounding (which is what we'll end up doing for the SSE2 style conversions like CVTPD2DQ).

Can we check for strictfp mode? I think we can assume NearestTiesToEven in non strictfp mode.

@ericxu233 ping?

@ericxu233
Copy link
Contributor Author

@phoebewang Are you OK with this? An alternative is we just allow exact rounding (which is what we'll end up doing for the SSE2 style conversions like CVTPD2DQ).

Can we check for strictfp mode? I think we can assume NearestTiesToEven in non strictfp mode.

@ericxu233 ping?

I am unsure of with means? Could you elaborate on this a little bit? I thought the rounding mode used was from the instructions. Do you mean that in non-strictfp mode and in MXCSR, I assume "NearestTiesToEven"?

default:
llvm_unreachable("Invalid immediate rounding mode");
}
} else {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean in this case, we need to check isFPConstrained before setting NearestTiesToEven.

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Requires merge conflict fixes and strict fp checks

@ericxu233 ericxu233 requested a review from RKSimon November 27, 2025 05:39
@ericxu233
Copy link
Contributor Author

ericxu233 commented Nov 27, 2025

@RKSimon Ping for review again...added strict fp mode

@github-actions
Copy link

github-actions bot commented Nov 27, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@RKSimon RKSimon requested a review from phoebewang November 27, 2025 10:06
Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@phoebewang _MM_FROUND_CUR_DIRECTION now only fails in strict mode if we don't have a lossless truncation - is that what you were after?

@phoebewang
Copy link
Contributor

@phoebewang _MM_FROUND_CUR_DIRECTION now only fails in strict mode if we don't have a lossless truncation - is that what you were after?

Yes, LGTM.

Copy link
Collaborator

@RKSimon RKSimon left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@RKSimon RKSimon enabled auto-merge (squash) November 27, 2025 12:32
@RKSimon RKSimon merged commit eee09ca into llvm:main Nov 27, 2025
9 of 10 checks passed
augusto2112 pushed a commit to augusto2112/llvm-project that referenced this pull request Dec 3, 2025
kcloudy0717 pushed a commit to kcloudy0717/llvm-project that referenced this pull request Dec 4, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

backend:X86 clang:bytecode Issues for the clang bytecode constexpr interpreter clang:frontend Language frontend issues, e.g. anything involving "Sema" clang Clang issues not falling into any other category

Projects

None yet

Development

Successfully merging this pull request may close these issues.

[X86][Clang] VectorExprEvaluator::VisitCallExpr / InterpretBuiltin - allow F16C CVTPS2PH intrinsics to be used in constexpr

5 participants