Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions clang/include/clang/Basic/BuiltinsX86.td
Original file line number Diff line number Diff line change
Expand Up @@ -697,11 +697,13 @@ let Features = "avx2", Attributes = [NoThrow, RequiredVectorWidth<128>] in {
def gatherq_d : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int const *, _Vector<2, long long int>, _Vector<4, int>, _Constant char)">;
}

let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in {
let Features = "f16c",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in {
def vcvtps2ph : X86Builtin<"_Vector<8, short>(_Vector<4, float>, _Constant int)">;
}

let Features = "f16c", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in {
let Features = "f16c",
Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in {
def vcvtps2ph256 : X86Builtin<"_Vector<8, short>(_Vector<8, float>, _Constant int)">;
}

Expand Down
89 changes: 89 additions & 0 deletions clang/lib/AST/ByteCode/InterpBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3002,6 +3002,91 @@ static bool interp__builtin_vec_set(InterpState &S, CodePtr OpPC,
return true;
}

static bool interp__builtin_ia32_vcvtps2ph(InterpState &S, CodePtr OpPC,
const CallExpr *Call) {
// Arguments are: vector of floats, rounding immediate
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The input vector is of floats, but the output vector is of integers?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, since this is converting single-precision (float32) values to half-precision (float16) which is stored by __m128i.

assert(Call->getNumArgs() == 2);

APSInt Imm = popToAPSInt(S, Call->getArg(1));
const Pointer &Src = S.Stk.pop<Pointer>();
const Pointer &Dst = S.Stk.peek<Pointer>();

assert(Src.getFieldDesc()->isPrimitiveArray());
assert(Dst.getFieldDesc()->isPrimitiveArray());

const auto *SrcVTy = Call->getArg(0)->getType()->castAs<VectorType>();
unsigned SrcNumElems = SrcVTy->getNumElements();
const auto *DstVTy = Call->getType()->castAs<VectorType>();
unsigned DstNumElems = DstVTy->getNumElements();

const llvm::fltSemantics &HalfSem =
S.getASTContext().getFloatTypeSemantics(S.getASTContext().HalfTy);

// imm[2] == 1 means use MXCSR rounding mode.
// In that case, we can only evaluate if the conversion is exact.
int ImmVal = Imm.getZExtValue();
bool UseMXCSR = (ImmVal & 4) != 0;

llvm::RoundingMode RM;
if (!UseMXCSR) {
switch (ImmVal & 3) {
case 0:
RM = llvm::RoundingMode::NearestTiesToEven;
break;
case 1:
RM = llvm::RoundingMode::TowardNegative;
break;
case 2:
RM = llvm::RoundingMode::TowardPositive;
break;
case 3:
RM = llvm::RoundingMode::TowardZero;
break;
default:
llvm_unreachable("Invalid immediate rounding mode");
}
} else {
// For MXCSR, we must check for exactness. We can use any rounding mode
// for the trial conversion since the result is the same if it's exact.
RM = llvm::RoundingMode::NearestTiesToEven;
}

QualType DstElemQT = Dst.getFieldDesc()->getElemQualType();
PrimType DstElemT = *S.getContext().classify(DstElemQT);

for (unsigned I = 0; I != SrcNumElems; ++I) {
Floating SrcVal = Src.elem<Floating>(I);
APFloat DstVal = SrcVal.getAPFloat();

bool LostInfo;
APFloat::opStatus St = DstVal.convert(HalfSem, RM, &LostInfo);

if (UseMXCSR && St != APFloat::opOK) {
S.FFDiag(S.Current->getSource(OpPC),
diag::note_constexpr_dynamic_rounding);
return false;
}

INT_TYPE_SWITCH_NO_BOOL(DstElemT, {
// Convert the destination value's bit pattern to an unsigned integer,
// then reconstruct the element using the target type's 'from' method.
uint64_t RawBits = DstVal.bitcastToAPInt().getZExtValue();
Dst.elem<T>(I) = T::from(RawBits);
});
}

// Zero out remaining elements if the destination has more elements
// (e.g., vcvtps2ph converting 4 floats to 8 shorts).
if (DstNumElems > SrcNumElems) {
for (unsigned I = SrcNumElems; I != DstNumElems; ++I) {
INT_TYPE_SWITCH_NO_BOOL(DstElemT, { Dst.elem<T>(I) = T::from(0); });
}
}

Dst.initializeAllElements();
return true;
}

bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
uint32_t BuiltinID) {
if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID))
Expand Down Expand Up @@ -3845,6 +3930,10 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call,
case X86::BI__builtin_ia32_insert128i256:
return interp__builtin_x86_insert_subvector(S, OpPC, Call, BuiltinID);

case clang::X86::BI__builtin_ia32_vcvtps2ph:
case clang::X86::BI__builtin_ia32_vcvtps2ph256:
return interp__builtin_ia32_vcvtps2ph(S, OpPC, Call);

case X86::BI__builtin_ia32_vec_ext_v4hi:
case X86::BI__builtin_ia32_vec_ext_v16qi:
case X86::BI__builtin_ia32_vec_ext_v8hi:
Expand Down
73 changes: 73 additions & 0 deletions clang/lib/AST/ExprConstant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12442,6 +12442,79 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {

return Success(APValue(Elems.data(), NumElems), E);
}

case clang::X86::BI__builtin_ia32_vcvtps2ph:
case clang::X86::BI__builtin_ia32_vcvtps2ph256: {
APValue SrcVec;
if (!EvaluateAsRValue(Info, E->getArg(0), SrcVec))
return false;

APSInt Imm;
if (!EvaluateInteger(E->getArg(1), Imm, Info))
return false;

const auto *SrcVTy = E->getArg(0)->getType()->castAs<VectorType>();
unsigned SrcNumElems = SrcVTy->getNumElements();
const auto *DstVTy = E->getType()->castAs<VectorType>();
unsigned DstNumElems = DstVTy->getNumElements();
QualType DstElemTy = DstVTy->getElementType();

const llvm::fltSemantics &HalfSem =
Info.Ctx.getFloatTypeSemantics(Info.Ctx.HalfTy);

int ImmVal = Imm.getZExtValue();
bool UseMXCSR = (ImmVal & 4) != 0;

llvm::RoundingMode RM;
if (!UseMXCSR) {
switch (ImmVal & 3) {
case 0:
RM = llvm::RoundingMode::NearestTiesToEven;
break;
case 1:
RM = llvm::RoundingMode::TowardNegative;
break;
case 2:
RM = llvm::RoundingMode::TowardPositive;
break;
case 3:
RM = llvm::RoundingMode::TowardZero;
break;
default:
llvm_unreachable("Invalid immediate rounding mode");
}
} else {
RM = llvm::RoundingMode::NearestTiesToEven;
}

SmallVector<APValue, 8> ResultElements;
ResultElements.reserve(DstNumElems);

for (unsigned I = 0; I < SrcNumElems; ++I) {
APFloat SrcVal = SrcVec.getVectorElt(I).getFloat();

bool LostInfo;
APFloat::opStatus St = SrcVal.convert(HalfSem, RM, &LostInfo);

if (UseMXCSR && St != APFloat::opOK) {
Info.FFDiag(E, diag::note_constexpr_dynamic_rounding);
return false;
}

APSInt DstInt(SrcVal.bitcastToAPInt(),
DstElemTy->isUnsignedIntegerOrEnumerationType());
ResultElements.push_back(APValue(DstInt));
}

if (DstNumElems > SrcNumElems) {
APSInt Zero = Info.Ctx.MakeIntValue(0, DstElemTy);
for (unsigned I = SrcNumElems; I < DstNumElems; ++I) {
ResultElements.push_back(APValue(Zero));
}
}

return Success(ResultElements, E);
}
}
}

Expand Down
69 changes: 63 additions & 6 deletions clang/test/CodeGen/X86/f16c-builtins.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,37 @@ __m128 test_mm_cvtph_ps(__m128i a) {
return _mm_cvtph_ps(a);
}

__m128i test_mm_cvtps_ph(__m128 a) {
// CHECK-LABEL: test_mm_cvtps_ph
// CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %{{.*}}, i32 0)
return _mm_cvtps_ph(a, 0);
}

// A value exactly halfway between 1.0 and the next representable FP16 number.
// In binary, its significand ends in ...000, followed by a tie-bit 1.
#define POS_HALFWAY (1.0f + 0.00048828125f) // 1.0 + 2^-11, a tie-breaking case

//
// _mm_cvtps_ph (128-bit, 4 floats -> 8 shorts, 4 are zero-padded)
//
// Test values: -2.5f, 1.123f, POS_HALFWAY
TEST_CONSTEXPR(match_v8hi(
_mm_cvtps_ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEAREST_INT),
0xC100, 0x3C7E, 0x3C00, 0x0000, 0, 0, 0, 0
));
TEST_CONSTEXPR(match_v8hi(
_mm_cvtps_ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEG_INF),
0xC100, 0x3C7D, 0x3C00, 0x0000, 0, 0, 0, 0
));
TEST_CONSTEXPR(match_v8hi(
_mm_cvtps_ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_POS_INF),
0xC100, 0x3C7E, 0x3C01, 0x0000, 0, 0, 0, 0
));
TEST_CONSTEXPR(match_v8hi(
_mm_cvtps_ph(_mm_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_ZERO),
0xC100, 0x3C7D, 0x3C00, 0x0000, 0, 0, 0, 0
));

__m256 test_mm256_cvtph_ps(__m128i a) {
// CHECK-LABEL: test_mm256_cvtph_ps
// CHECK: fpext <8 x half> %{{.*}} to <8 x float>
Expand All @@ -56,14 +87,40 @@ TEST_CONSTEXPR(match_m256(
1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 0.5f, -2.0f, 0.0f
));

__m128i test_mm_cvtps_ph(__m128 a) {
// CHECK-LABEL: test_mm_cvtps_ph
// CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %{{.*}}, i32 0)
return _mm_cvtps_ph(a, 0);
}
//
// _mm256_cvtps_ph (256-bit, 8 floats -> 8 shorts)
//
// Test values: -2.5f, 1.123f, POS_HALFWAY
TEST_CONSTEXPR(match_v8hi(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(style) Please can you position these just below the matching test_mm_cvtps_ph/test_mm256_cvtps_ph functions - it helps if we keep all the tests relevant to a specific intrinsic together.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

_mm256_cvtps_ph(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEAREST_INT),
0xC100, 0x3C7E, 0x3C00, 0x0000, 0xC100, 0x3C7E, 0x3C00, 0x0000
));
TEST_CONSTEXPR(match_v8hi(
_mm256_cvtps_ph(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_NEG_INF),
0xC100, 0x3C7D, 0x3C00, 0x0000, 0xC100, 0x3C7D, 0x3C00, 0x0000
));
TEST_CONSTEXPR(match_v8hi(
_mm256_cvtps_ph(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_POS_INF),
0xC100, 0x3C7E, 0x3C01, 0x0000, 0xC100, 0x3C7E, 0x3C01, 0x0000
));
TEST_CONSTEXPR(match_v8hi(
_mm256_cvtps_ph(_mm256_setr_ps(-2.5f, 1.123f, POS_HALFWAY, 0.0f, -2.5f, 1.123f, POS_HALFWAY, 0.0f), _MM_FROUND_TO_ZERO),
0xC100, 0x3C7D, 0x3C00, 0x0000, 0xC100, 0x3C7D, 0x3C00, 0x0000
));

//
// Tests for Exact Dynamic Rounding
//
// Test that dynamic rounding SUCCEEDS for exactly representable values.
// We use _MM_FROUND_CUR_DIRECTION (value 4) to specify dynamic rounding.
// Inputs: -2.5f, 0.125f, -16.0f are all exactly representable in FP16.
TEST_CONSTEXPR(match_v8hi(
__builtin_ia32_vcvtps2ph256(_mm256_setr_ps(-2.5f, 0.125f, -16.0f, 0.0f, -2.5f, 0.125f, -16.0f, 0.0f), _MM_FROUND_CUR_DIRECTION),
0xC100, 0x3000, 0xCC00, 0x0000, 0xC100, 0x3000, 0xCC00, 0x0000
));

__m128i test_mm256_cvtps_ph(__m256 a) {
// CHECK-LABEL: test_mm256_cvtps_ph
// CHECK: call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %{{.*}}, i32 0)
return _mm256_cvtps_ph(a, 0);
}
}