Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -716,11 +716,13 @@ let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def gatherq_d : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int const *, _Vector<2, long long int>, _Vector<4, int>, _Constant char)">;
}

let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
let Features = "f16c",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def vcvtps2ph : X86Builtin<"_Vector<8, short>(_Vector<4, float>, _Constant int)">;
}

let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
let Features = "f16c",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def vcvtps2ph256 : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int)">;
}

Expand Down
91 changes: 91 additions & 0 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3527,7 +3527,94 @@ static bool interp__builtin_ia32_shufbitqmb_mask(InterpState &S, CodePtr OpPC,
}

pushInteger(S, RetMask, Call->getType());
return true;
}

static bool interp__builtin_ia32_vcvtps2ph(InterpState &S, CodePtr OpPC,
const CallExpr *Call) {
// Arguments are: vector of floats, rounding immediate
assert(Call->getNumArgs() == 2);

APSInt Imm = popToAPSInt(S, Call->getArg(1));
const Pointer &Src = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();

assert(Src.getFieldDesc()->isPrimitiveArray());
assert(Dst.getFieldDesc()->isPrimitiveArray());

const auto *SrcVTy = Call->getArg(0)->getType()->castAs<VectorType>();
unsigned SrcNumElems = SrcVTy->getNumElements();
const auto *DstVTy = Call->getType()->castAs<VectorType>();
unsigned DstNumElems = DstVTy->getNumElements();

const llvm::fltSemantics &HalfSem =
S.getASTContext().getFloatTypeSemantics(S.getASTContext().HalfTy);

// imm[2] == 1 means use MXCSR rounding mode.
// In that case, we can only evaluate if the conversion is exact.
int ImmVal = Imm.getZExtValue();
bool UseMXCSR = (ImmVal & 4) != 0;
bool IsFPConstrained =
Call->getFPFeaturesInEffect(S.getASTContext().getLangOpts())
.isFPConstrained();

llvm::RoundingMode RM;
if (!UseMXCSR) {
switch (ImmVal & 3) {
case 0:
RM = llvm::RoundingMode::NearestTiesToEven;
break;
case 1:
RM = llvm::RoundingMode::TowardNegative;
break;
case 2:
RM = llvm::RoundingMode::TowardPositive;
break;
case 3:
RM = llvm::RoundingMode::TowardZero;
break;
default:
llvm_unreachable("Invalid immediate rounding mode");
}
} else {
// For MXCSR, we must check for exactness. We can use any rounding mode
// for the trial conversion since the result is the same if it's exact.
RM = llvm::RoundingMode::NearestTiesToEven;
}

QualType DstElemQT = Dst.getFieldDesc()->getElemQualType();
PrimType DstElemT = *S.getContext().classify(DstElemQT);

for (unsigned I = 0; I != SrcNumElems; ++I) {
Floating SrcVal = Src.elem<Floating>(I);
APFloat DstVal = SrcVal.getAPFloat();

bool LostInfo;
APFloat::opStatus St = DstVal.convert(HalfSem, RM, &LostInfo);

if (UseMXCSR && IsFPConstrained && St != APFloat::opOK) {
S.FFDiag(S.Current->getSource(OpPC),
diag::note_constexpr_dynamic_rounding);
return false;
}

INT_TYPE_SWITCH_NO_BOOL(DstElemT, {
// Convert the destination value's bit pattern to an unsigned integer,
// then reconstruct the element using the target type's 'from' method.
uint64_t RawBits = DstVal.bitcastToAPInt().getZExtValue();
Dst.elem<T>(I) = T::from(RawBits);
});
}

// Zero out remaining elements if the destination has more elements
// (e.g., vcvtps2ph converting 4 floats to 8 shorts).
if (DstNumElems > SrcNumElems) {
for (unsigned I = SrcNumElems; I != DstNumElems; ++I) {
INT_TYPE_SWITCH_NO_BOOL(DstElemT, { Dst.elem<T>(I) = T::from(0); });
}
}

Dst.initializeAllElements();
return true;
}

Expand Down Expand Up @@ -4898,6 +4985,10 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_insert128i256:
return interp__builtin_x86_insert_subvector(S, OpPC, Call, BuiltinID);

case clang::X86::BI__builtin_ia32_vcvtps2ph:
case clang::X86::BI__builtin_ia32_vcvtps2ph256:
return interp__builtin_ia32_vcvtps2ph(S, OpPC, Call);

case X86::BI__builtin_ia32_vec_ext_v4hi:
case X86::BI__builtin_ia32_vec_ext_v16qi:
case X86::BI__builtin_ia32_vec_ext_v8hi:
Expand Down
75 changes: 75 additions & 0 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13870,6 +13870,81 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
return false;
return Success(R, E);
}

case clang::X86::BI__builtin_ia32_vcvtps2ph:
case clang::X86::BI__builtin_ia32_vcvtps2ph256: {
APValue SrcVec;
if (!EvaluateAsRValue(Info, E->getArg(0), SrcVec))
return false;

APSInt Imm;
if (!EvaluateInteger(E->getArg(1), Imm, Info))
return false;

const auto *SrcVTy = E->getArg(0)->getType()->castAs<VectorType>();
unsigned SrcNumElems = SrcVTy->getNumElements();
const auto *DstVTy = E->getType()->castAs<VectorType>();
unsigned DstNumElems = DstVTy->getNumElements();
QualType DstElemTy = DstVTy->getElementType();

const llvm::fltSemantics &HalfSem =
Info.Ctx.getFloatTypeSemantics(Info.Ctx.HalfTy);

int ImmVal = Imm.getZExtValue();
bool UseMXCSR = (ImmVal & 4) != 0;
bool IsFPConstrained =
E->getFPFeaturesInEffect(Info.Ctx.getLangOpts()).isFPConstrained();

llvm::RoundingMode RM;
if (!UseMXCSR) {
switch (ImmVal & 3) {
case 0:
RM = llvm::RoundingMode::NearestTiesToEven;
break;
case 1:
RM = llvm::RoundingMode::TowardNegative;
break;
case 2:
RM = llvm::RoundingMode::TowardPositive;
break;
case 3:
RM = llvm::RoundingMode::TowardZero;
break;
default:
llvm_unreachable("Invalid immediate rounding mode");
}
} else {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean in this case, we need to check isFPConstrained before setting NearestTiesToEven.

RM = llvm::RoundingMode::NearestTiesToEven;
}

SmallVector<APValue, 8> ResultElements;
ResultElements.reserve(DstNumElems);

for (unsigned I = 0; I < SrcNumElems; ++I) {
APFloat SrcVal = SrcVec.getVectorElt(I).getFloat();

bool LostInfo;
APFloat::opStatus St = SrcVal.convert(HalfSem, RM, &LostInfo);

if (UseMXCSR && IsFPConstrained && St != APFloat::opOK) {
Info.FFDiag(E, diag::note_constexpr_dynamic_rounding);
return false;
}

APSInt DstInt(SrcVal.bitcastToAPInt(),
DstElemTy->isUnsignedIntegerOrEnumerationType());
ResultElements.push_back(APValue(DstInt));
}

if (DstNumElems > SrcNumElems) {
APSInt Zero = Info.Ctx.MakeIntValue(0, DstElemTy);
for (unsigned I = SrcNumElems; I < DstNumElems; ++I) {
ResultElements.push_back(APValue(Zero));
}
}

return Success(ResultElements, E);
}
}
}

Expand Down
57 changes: 57 additions & 0 deletions clang/test/CodeGen/X86/f16c-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,31 @@ __m128 test_mm_cvtph_ps(__m128i a) {
return _mm_cvtph_ps(a);
}

// A value exactly halfway between 1.0 and the next representable FP16 number.
// In binary, its significand ends in ...000, followed by a tie-bit 1.
#define POS_HALFWAY (1.0f + 0.00048828125f) // 1.0 + 2^-11, a tie-breaking case

//
// _mm_cvtps_ph (128-bit, 4 floats -> 8 shorts, 4 are zero-padded)
//
// Test values: -2.5f, 1.123f, POS_HALFWAY
TEST_CONSTEXPR(match_v8hi(
_mm_cvtps_ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEAREST_INT),
0xC100, 0x3C7E, 0x3C00, 0x0000, 0, 0, 0, 0
));
TEST_CONSTEXPR(match_v8hi(
_mm_cvtps_ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEG_INF),
0xC100, 0x3C7D, 0x3C00, 0x0000, 0, 0, 0, 0
));
TEST_CONSTEXPR(match_v8hi(
_mm_cvtps_ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_POS_INF),
0xC100, 0x3C7E, 0x3C01, 0x0000, 0, 0, 0, 0
));
TEST_CONSTEXPR(match_v8hi(
_mm_cvtps_ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_ZERO),
0xC100, 0x3C7D, 0x3C00, 0x0000, 0, 0, 0, 0
));

__m256 test_mm256_cvtph_ps(__m128i a) {
// CHECK-LABEL: test_mm256_cvtph_ps
// CHECK: fpext <8 x half> %{{.*}} to <8 x float>
Expand All @@ -56,12 +81,44 @@ TEST_CONSTEXPR(match_m256(
1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 0.5f, -2.0f, 0.0f
));

//
// _mm256_cvtps_ph (256-bit, 8 floats -> 8 shorts)
//
// Test values: -2.5f, 1.123f, POS_HALFWAY
TEST_CONSTEXPR(match_v8hi(
_mm256_cvtps_ph(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEAREST_INT),
0xC100, 0x3C7E, 0x3C00, 0x0000, 0xC100, 0x3C7E, 0x3C00, 0x0000
));
TEST_CONSTEXPR(match_v8hi(
_mm256_cvtps_ph(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEG_INF),
0xC100, 0x3C7D, 0x3C00, 0x0000, 0xC100, 0x3C7D, 0x3C00, 0x0000
));
TEST_CONSTEXPR(match_v8hi(
_mm256_cvtps_ph(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_POS_INF),
0xC100, 0x3C7E, 0x3C01, 0x0000, 0xC100, 0x3C7E, 0x3C01, 0x0000
));
TEST_CONSTEXPR(match_v8hi(
_mm256_cvtps_ph(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_ZERO),
0xC100, 0x3C7D, 0x3C00, 0x0000, 0xC100, 0x3C7D, 0x3C00, 0x0000
));

__m128i test_mm_cvtps_ph(__m128 a) {
// CHECK-LABEL: test_mm_cvtps_ph
// CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %{{.*}}, i32 0)
return _mm_cvtps_ph(a, 0);
}

//
// Tests for Exact Dynamic Rounding
//
// Test that dynamic rounding SUCCEEDS for exactly representable values.
// We use _MM_FROUND_CUR_DIRECTION (value 4) to specify dynamic rounding.
// Inputs: -2.5f, 0.125f, -16.0f are all exactly representable in FP16.
TEST_CONSTEXPR(match_v8hi(
__builtin_ia32_vcvtps2ph256(_mm256_setr_ps(-2.5f, 0.125f, -16.0f, 0.0f, -2.5f, 0.125f, -16.0f, 0.0f), _MM_FROUND_CUR_DIRECTION),
0xC100, 0x3000, 0xCC00, 0x0000, 0xC100, 0x3000, 0xCC00, 0x0000
));

__m128i test_mm256_cvtps_ph(__m256 a) {
// CHECK-LABEL: test_mm256_cvtps_ph
// CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %{{.*}}, i32 0)
Expand Down