Skip to content

Commit 6acd11b

Browse files
authored
ConvertFloat32ToFloat16: Use DirectXMath conversion functions (microsoft#4855)
The custom half <-> float conversion functions had problems in multiple scenarios. This PR replaces them with thin wrappers around the DirectXMath conversion functions.
1 parent 5c4d3b6 commit 6acd11b

File tree

2 files changed

+11
-85
lines changed

2 files changed

+11
-85
lines changed

include/dxc/Test/HlslTestUtils.h

Lines changed: 3 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -406,91 +406,9 @@ inline bool isnanFloat16(uint16_t val) {
406406
(val & FLOAT16_BIT_MANTISSA) != 0;
407407
}
408408

409-
inline uint16_t ConvertFloat32ToFloat16(float val) {
410-
union Bits {
411-
uint32_t u_bits;
412-
float f_bits;
413-
};
414-
415-
static const uint32_t SignMask = 0x8000;
416-
417-
// Minimum f32 value representable in f16 format without denormalizing
418-
static const uint32_t Min16in32 = 0x38800000;
419-
420-
// Maximum f32 value (next to infinity)
421-
static const uint32_t Max32 = 0x7f7FFFFF;
422-
423-
// Mask for f32 mantissa
424-
static const uint32_t Fraction32Mask = 0x007FFFFF;
425-
426-
// pow(2,24)
427-
static const uint32_t DenormalRatio = 0x4B800000;
428-
429-
static const uint32_t NormalDelta = 0x38000000;
430-
431-
Bits bits;
432-
bits.f_bits = val;
433-
uint32_t sign = bits.u_bits & (SignMask << 16);
434-
Bits Abs;
435-
Abs.u_bits = bits.u_bits ^ sign;
436-
437-
bool isLessThanNormal = Abs.f_bits < *(const float*)&Min16in32;
438-
bool isInfOrNaN = Abs.u_bits > Max32;
439-
440-
if (isLessThanNormal) {
441-
// Compute Denormal result
442-
return (uint16_t)(Abs.f_bits * *(const float*)(&DenormalRatio)) | (uint16_t)(sign >> 16);
443-
}
444-
else if (isInfOrNaN) {
445-
// Compute Inf or Nan result
446-
uint32_t Fraction = Abs.u_bits & Fraction32Mask;
447-
uint16_t IsNaN = Fraction == 0 ? 0 : 0xffff;
448-
return (IsNaN & FLOAT16_BIT_MANTISSA) | FLOAT16_BIT_EXP | (uint16_t)(sign >> 16);
449-
}
450-
else {
451-
// Compute Normal result
452-
return (uint16_t)((Abs.u_bits - NormalDelta) >> 13) | (uint16_t)(sign >> 16);
453-
}
454-
}
455-
456-
inline float ConvertFloat16ToFloat32(uint16_t x) {
457-
union Bits {
458-
float f_bits;
459-
uint32_t u_bits;
460-
};
461-
462-
uint32_t Sign = (x & FLOAT16_BIT_SIGN) << 16;
463-
464-
// nan -> exponent all set and mantissa is nonzero
465-
// +/-inf -> exponent all set and mantissa is zero
466-
// denorm -> exponent zero and significand nonzero
467-
uint32_t Abs = (x & 0x7fff);
468-
uint32_t IsNormal = Abs > FLOAT16_BIGGEST_DENORM;
469-
uint32_t IsInfOrNaN = Abs > FLOAT16_BIGGEST_NORMAL;
470-
471-
// Signless Result for denormals
472-
uint32_t DenormRatio = 0x33800000;
473-
float DenormResult = Abs * (*(float*)&DenormRatio);
474-
475-
uint32_t AbsShifted = Abs << 13;
476-
// Signless Result for normals
477-
uint32_t NormalResult = AbsShifted + 0x38000000;
478-
// Signless Result for inf & nans
479-
uint32_t InfResult = AbsShifted + 0x70000000;
480-
481-
Bits bits;
482-
bits.u_bits = 0;
483-
if (IsInfOrNaN)
484-
bits.u_bits |= InfResult;
485-
else if (IsNormal)
486-
bits.u_bits |= NormalResult;
487-
else
488-
bits.f_bits = DenormResult;
489-
bits.u_bits |= Sign;
490-
return bits.f_bits;
491-
}
492-
uint16_t ConvertFloat32ToFloat16(float val);
493-
float ConvertFloat16ToFloat32(uint16_t val);
409+
// These are defined in ShaderOpTest.cpp using DirectXPackedVector functions.
410+
uint16_t ConvertFloat32ToFloat16(float val) throw();
411+
float ConvertFloat16ToFloat32(uint16_t val) throw();
494412

495413
inline bool CompareFloatULP(const float &fsrc, const float &fref, int ULPTolerance,
496414
hlsl::DXIL::Float32DenormMode mode = hlsl::DXIL::Float32DenormMode::Any) {

tools/clang/unittests/HLSL/ShaderOpTest.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232

3333
#include <stdlib.h>
3434
#include <DirectXMath.h>
35+
#include <DirectXPackedVector.h>
3536
#include <intsafe.h>
3637
#include <strsafe.h>
3738
#include <xmllite.h>
@@ -40,6 +41,13 @@
4041
///////////////////////////////////////////////////////////////////////////////
4142
// Useful helper functions.
4243

44+
uint16_t ConvertFloat32ToFloat16(float Value) throw() {
45+
return DirectX::PackedVector::XMConvertFloatToHalf(Value);
46+
}
47+
float ConvertFloat16ToFloat32(uint16_t Value) throw() {
48+
return DirectX::PackedVector::XMConvertHalfToFloat(Value);
49+
}
50+
4351
static st::OutputStringFn g_OutputStrFn;
4452
static void * g_OutputStrFnCtx;
4553

0 commit comments

Comments
 (0)