Fixed float denorm conversion handling for XMConvertFloatToHalf (#114)

walbourn · web-flow · commit d0bbddc9f29a · 2020-06-25T15:08:32.000-07:00
diff --git a/Inc/DirectXPackedVector.h b/Inc/DirectXPackedVector.h
@@ -71,7 +71,7 @@ namespace DirectX
         //------------------------------------------------------------------------------
         // 16 bit floating point number consisting of a sign bit, a 5 bit biased
         // exponent, and a 10 bit mantissa
-        typedef uint16_t HALF;
+        using HALF = uint16_t;
 
         //------------------------------------------------------------------------------
         // 2D Vector; 16 bit floating point components
diff --git a/Inc/DirectXPackedVector.inl b/Inc/DirectXPackedVector.inl
@@ -387,8 +387,8 @@ inline HALF XMConvertFloatToHalf(float Value) noexcept
 {
 #if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
     __m128 V1 = _mm_set_ss(Value);
-    __m128i V2 = _mm_cvtps_ph(V1, 0);
-    return static_cast<HALF>(_mm_cvtsi128_si32(V2));
+    __m128i V2 = _mm_cvtps_ph(V1, _MM_FROUND_TO_NEAREST_INT);
+    return static_cast<HALF>(_mm_extract_epi16(V2, 0));
 #elif defined(_XM_ARM_NEON_INTRINSICS_) && (defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || __aarch64__) && !defined(_XM_NO_INTRINSICS_)
     float32x4_t vFloat = vdupq_n_f32(Value);
     float16x4_t vHalf = vcvt_f16_f32(vFloat);
@@ -399,38 +399,29 @@ inline HALF XMConvertFloatToHalf(float Value) noexcept
     auto IValue = reinterpret_cast<uint32_t*>(&Value)[0];
     uint32_t Sign = (IValue & 0x80000000U) >> 16U;
     IValue = IValue & 0x7FFFFFFFU;      // Hack off the sign
-
-    if (IValue > 0x477FE000U)
+    if (IValue >= 0x47800000 /*e+16*/)
     {
-        // The number is too large to be represented as a half.  Saturate to infinity.
-        if (((IValue & 0x7F800000) == 0x7F800000) && ((IValue & 0x7FFFFF) != 0))
-        {
-            Result = 0x7FFF; // NAN
-        }
-        else
-        {
-            Result = 0x7C00U; // INF
-        }
+        // The number is too large to be represented as a half. Return infinity or NaN
+        Result = 0x7C00U | ((IValue > 0x7F800000) ? (0x200 | ((IValue >> 13U) & 0x3FFU)) : 0U);
     }
-    else if (!IValue)
+    else if (IValue <= 0x33000000U /*e-25*/)
     {
         Result = 0;
     }
+    else if (IValue < 0x38800000U /*e-14*/)
+    {
+        // The number is too small to be represented as a normalized half.
+        // Convert it to a denormalized value.
+        uint32_t Shift = 125U - (IValue >> 23U);
+        IValue = 0x800000U | (IValue & 0x7FFFFFU);
+        Result = IValue >> (Shift + 1);
+        uint32_t s = (IValue & ((1U << Shift) - 1)) != 0;
+        Result += (Result | s) & ((IValue >> Shift) & 1U);
+    }
     else
     {
-        if (IValue < 0x38800000U)
-        {
-            // The number is too small to be represented as a normalized half.
-            // Convert it to a denormalized value.
-            uint32_t Shift = 113U - (IValue >> 23U);
-            IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift;
-        }
-        else
-        {
-            // Rebias the exponent to represent the value as a normalized half.
-            IValue += 0xC8000000U;
-        }
-
+        // Rebias the exponent to represent the value as a normalized half.
+        IValue += 0xC8000000U;
         Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U) & 0x7FFFU;
     }
     return static_cast<HALF>(Result | Sign);
@@ -477,7 +468,7 @@ inline HALF* XMConvertFloatToHalfStream
                         __m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
                         pFloat += InputStride * 4;
 
-                        __m128i HV = _mm_cvtps_ph(FV, 0);
+                        __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
 
                         _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
                         pHalf += OutputStride * 4;
@@ -492,7 +483,7 @@ inline HALF* XMConvertFloatToHalfStream
                         __m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
                         pFloat += InputStride * 4;
 
-                        __m128i HV = _mm_cvtps_ph(FV, 0);
+                        __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
 
                         _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
                         pHalf += OutputStride * 4;
@@ -510,7 +501,7 @@ inline HALF* XMConvertFloatToHalfStream
                         __m128 FV = _mm_load_ps(reinterpret_cast<const float*>(pFloat));
                         pFloat += InputStride * 4;
 
-                        __m128i HV = _mm_cvtps_ph(FV, 0);
+                        __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
 
                         *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
                         pHalf += OutputStride;
@@ -531,7 +522,7 @@ inline HALF* XMConvertFloatToHalfStream
                         __m128 FV = _mm_loadu_ps(reinterpret_cast<const float*>(pFloat));
                         pFloat += InputStride * 4;
 
-                        __m128i HV = _mm_cvtps_ph(FV, 0);
+                        __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
 
                         *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
                         pHalf += OutputStride;
@@ -567,7 +558,7 @@ inline HALF* XMConvertFloatToHalfStream
                 __m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
                 FV = _mm_blend_ps(FV, FT, 0xC);
 
-                __m128i HV = _mm_cvtps_ph(FV, 0);
+                __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
 
                 _mm_storel_epi64(reinterpret_cast<__m128i*>(pHalf), HV);
                 pHalf += OutputStride * 4;
@@ -595,7 +586,7 @@ inline HALF* XMConvertFloatToHalfStream
                 __m128 FT = _mm_blend_ps(FV3, FV4, 0x8);
                 FV = _mm_blend_ps(FV, FT, 0xC);
 
-                __m128i HV = _mm_cvtps_ph(FV, 0);
+                __m128i HV = _mm_cvtps_ph(FV, _MM_FROUND_TO_NEAREST_INT);
 
                 *reinterpret_cast<HALF*>(pHalf) = static_cast<HALF>(_mm_extract_epi16(HV, 0));
                 pHalf += OutputStride;
@@ -2099,7 +2090,7 @@ inline void XM_CALLCONV XMStoreHalf2
 {
     assert(pDestination);
 #if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128i V1 = _mm_cvtps_ph(V, 0);
+    __m128i V1 = _mm_cvtps_ph(V, _MM_FROUND_TO_NEAREST_INT);
     _mm_store_ss(reinterpret_cast<float*>(pDestination), _mm_castsi128_ps(V1));
 #else
     pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
@@ -2655,7 +2646,7 @@ inline void XM_CALLCONV XMStoreHalf4
 {
     assert(pDestination);
 #if defined(_XM_F16C_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
-    __m128i V1 = _mm_cvtps_ph(V, 0);
+    __m128i V1 = _mm_cvtps_ph(V, _MM_FROUND_TO_NEAREST_INT);
     _mm_storel_epi64(reinterpret_cast<__m128i*>(pDestination), V1);
 #else
     XMFLOAT4A t;