Fix rounding of Float32->Float16 near eps(Float16(0.0)) (#32444)

Keno · web-flow · commit 82883a0e48b0 · 2019-07-02T12:52:50.000-04:00
* Move F16 table to a better place

* Fix rounding of Float32-&gt;Float16 near eps(Float16(0.0))

The originally described algorithm worked only for round-to-zero,
and our adjustment to make it round-to-nearest was incorrect
for values that result in Float16 subnormals. Fix this at the expense
of an extra `|` and `&amp;`.
diff --git a/base/float.jl b/base/float.jl
@@ -137,16 +137,61 @@ function Float32(x::Int128)
     reinterpret(Float32, s | d + y)
 end
 
+# Float32 -> Float16 algorithm from:
+#   "Fast Half Float Conversion" by Jeroen van der Zijp
+#   ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
+#
+# With adjustments for round-to-nearest, ties to even.
+#
+let _basetable = Vector{UInt16}(undef, 512),
+    _shifttable = Vector{UInt8}(undef, 512)
+    for i = 0:255
+        e = i - 127
+        if e < -25  # Very small numbers map to zero
+            _basetable[i|0x000+1] = 0x0000
+            _basetable[i|0x100+1] = 0x8000
+            _shifttable[i|0x000+1] = 25
+            _shifttable[i|0x100+1] = 25
+        elseif e < -14  # Small numbers map to denorms
+            _basetable[i|0x000+1] = 0x0000
+            _basetable[i|0x100+1] = 0x8000
+            _shifttable[i|0x000+1] = -e-1
+            _shifttable[i|0x100+1] = -e-1
+        elseif e <= 15  # Normal numbers just lose precision
+            _basetable[i|0x000+1] = ((e+15)<<10)
+            _basetable[i|0x100+1] = ((e+15)<<10) | 0x8000
+            _shifttable[i|0x000+1] = 13
+            _shifttable[i|0x100+1] = 13
+        elseif e < 128  # Large numbers map to Infinity
+            _basetable[i|0x000+1] = 0x7C00
+            _basetable[i|0x100+1] = 0xFC00
+            _shifttable[i|0x000+1] = 24
+            _shifttable[i|0x100+1] = 24
+        else  # Infinity and NaN's stay Infinity and NaN's
+            _basetable[i|0x000+1] = 0x7C00
+            _basetable[i|0x100+1] = 0xFC00
+            _shifttable[i|0x000+1] = 13
+            _shifttable[i|0x100+1] = 13
+        end
+    end
+    global const shifttable = (_shifttable...,)
+    global const basetable = (_basetable...,)
+end
+
 function Float16(val::Float32)
     f = reinterpret(UInt32, val)
     if isnan(val)
         t = 0x8000 ⊻ (0x8000 & ((f >> 0x10) % UInt16))
         return reinterpret(Float16, t ⊻ ((f >> 0xd) % UInt16))
     end
-    i = (f >> 23) & 0x1ff + 1
+    i = ((f & ~significand_mask(Float32)) >> significand_bits(Float32)) + 1
     @inbounds sh = shifttable[i]
-    f &= 0x007fffff
-    @inbounds h = (basetable[i] + (f >> sh)) % UInt16
+    f &= significand_mask(Float32)
+    # If `val` is subnormal, the tables are set up to force the
+    # result to 0, so the significand has an implicit `1` in the
+    # cases we care about.
+    f |= significand_mask(Float32) + 0x1
+    @inbounds h = (basetable[i] + (f >> sh) & significand_mask(Float16)) % UInt16
     # round
     # NOTE: we maybe should ignore NaNs here, but the payload is
     # getting truncated anyway so "rounding" it might not matter
@@ -202,45 +247,6 @@ function Float32(val::Float16)
     return reinterpret(Float32, ret)
 end
 
-# Float32 -> Float16 algorithm from:
-#   "Fast Half Float Conversion" by Jeroen van der Zijp
-#   ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
-
-let _basetable = Vector{UInt16}(undef, 512),
-    _shifttable = Vector{UInt8}(undef, 512)
-    for i = 0:255
-        e = i - 127
-        if e < -24  # Very small numbers map to zero
-            _basetable[i|0x000+1] = 0x0000
-            _basetable[i|0x100+1] = 0x8000
-            _shifttable[i|0x000+1] = 24
-            _shifttable[i|0x100+1] = 24
-        elseif e < -14  # Small numbers map to denorms
-            _basetable[i|0x000+1] = (0x0400>>(-e-14))
-            _basetable[i|0x100+1] = (0x0400>>(-e-14)) | 0x8000
-            _shifttable[i|0x000+1] = -e-1
-            _shifttable[i|0x100+1] = -e-1
-        elseif e <= 15  # Normal numbers just lose precision
-            _basetable[i|0x000+1] = ((e+15)<<10)
-            _basetable[i|0x100+1] = ((e+15)<<10) | 0x8000
-            _shifttable[i|0x000+1] = 13
-            _shifttable[i|0x100+1] = 13
-        elseif e < 128  # Large numbers map to Infinity
-            _basetable[i|0x000+1] = 0x7C00
-            _basetable[i|0x100+1] = 0xFC00
-            _shifttable[i|0x000+1] = 24
-            _shifttable[i|0x100+1] = 24
-        else  # Infinity and NaN's stay Infinity and NaN's
-            _basetable[i|0x000+1] = 0x7C00
-            _basetable[i|0x100+1] = 0xFC00
-            _shifttable[i|0x000+1] = 13
-            _shifttable[i|0x100+1] = 13
-        end
-    end
-    global const shifttable = (_shifttable...,)
-    global const basetable = (_basetable...,)
-end
-
 #convert(::Type{Float16}, x::Float32) = fptrunc(Float16, x)
 Float32(x::Float64) = fptrunc(Float32, x)
 Float16(x::Float64) = Float16(Float32(x))
@@ -867,6 +873,11 @@ exponent_one(::Type{Float16}) =     0x3c00
 exponent_half(::Type{Float16}) =    0x3800
 significand_mask(::Type{Float16}) = 0x03ff
 
+for T in (Float16, Float32, Float64)
+    @eval significand_bits(::Type{$T}) = $(trailing_ones(significand_mask(T)))
+    @eval exponent_bits(::Type{$T}) = $(sizeof(T)*8 - significand_bits(T) - 1)
+end
+
 # integer size of float
 uinttype(::Type{Float64}) = UInt64
 uinttype(::Type{Float32}) = UInt32
diff --git a/base/math.jl b/base/math.jl
@@ -21,7 +21,8 @@ import .Base: log, exp, sin, cos, tan, sinh, cosh, tanh, asin,
              exp10, expm1, log1p
 
 using .Base: sign_mask, exponent_mask, exponent_one,
-            exponent_half, uinttype, significand_mask
+            exponent_half, uinttype, significand_mask,
+            significand_bits, exponent_bits
 
 using Core.Intrinsics: sqrt_llvm
 
@@ -38,8 +39,6 @@ end
 end
 
 for T in (Float16, Float32, Float64)
-    @eval significand_bits(::Type{$T}) = $(trailing_ones(significand_mask(T)))
-    @eval exponent_bits(::Type{$T}) = $(sizeof(T)*8 - significand_bits(T) - 1)
     @eval exponent_bias(::Type{$T}) = $(Int(exponent_one(T) >> significand_bits(T)))
     # maximum float exponent
     @eval exponent_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T)) - exponent_bias(T))
diff --git a/test/float16.jl b/test/float16.jl
@@ -166,3 +166,14 @@ end
 
 # issue #17148
 @test rem(Float16(1.2), Float16(one(1.2))) == 0.20019531f0
+
+# issue #32441
+const f16eps2 = Float32(eps(Float16(0.0)))/2
+const minsubf16 = nextfloat(Float16(0.0))
+const minsubf16_32 = Float32(minsubf16)
+@test Float16(f16eps2) == Float16(0.0)
+@test Float16(nextfloat(f16eps2)) == minsubf16
+@test Float16(prevfloat(minsubf16_32)) == minsubf16
+# Ties to even, in this case up
+@test Float16(minsubf16_32 + f16eps2) == nextfloat(minsubf16)
+@test Float16(prevfloat(minsubf16_32 + f16eps2)) == minsubf16