@@ -140,19 +140,21 @@ end
140140# Float32 -> Float16 algorithm from:
141141# "Fast Half Float Conversion" by Jeroen van der Zijp
142142# ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
143-
143+ #
144+ # With adjustments for round-to-nearest, ties to even.
145+ #
144146let _basetable = Vector {UInt16} (undef, 512 ),
145147 _shifttable = Vector {UInt8} (undef, 512 )
146148 for i = 0 : 255
147149 e = i - 127
148- if e < - 24 # Very small numbers map to zero
150+ if e < - 25 # Very small numbers map to zero
149151 _basetable[i| 0x000 + 1 ] = 0x0000
150152 _basetable[i| 0x100 + 1 ] = 0x8000
151- _shifttable[i| 0x000 + 1 ] = 24
152- _shifttable[i| 0x100 + 1 ] = 24
153+ _shifttable[i| 0x000 + 1 ] = 25
154+ _shifttable[i| 0x100 + 1 ] = 25
153155 elseif e < - 14 # Small numbers map to denorms
154- _basetable[i| 0x000 + 1 ] = ( 0x0400 >> ( - e - 14 ))
155- _basetable[i| 0x100 + 1 ] = ( 0x0400 >> ( - e - 14 )) | 0x8000
156+ _basetable[i| 0x000 + 1 ] = 0x0000
157+ _basetable[i| 0x100 + 1 ] = 0x8000
156158 _shifttable[i| 0x000 + 1 ] = - e- 1
157159 _shifttable[i| 0x100 + 1 ] = - e- 1
158160 elseif e <= 15 # Normal numbers just lose precision
@@ -182,10 +184,14 @@ function Float16(val::Float32)
182184 t = 0x8000 ⊻ (0x8000 & ((f >> 0x10 ) % UInt16))
183185 return reinterpret (Float16, t ⊻ ((f >> 0xd ) % UInt16))
184186 end
185- i = (f >> 23 ) & 0x1ff + 1
187+ i = ((f & ~ significand_mask (Float32)) >> significand_bits (Float32)) + 1
186188 @inbounds sh = shifttable[i]
187- f &= 0x007fffff
188- @inbounds h = (basetable[i] + (f >> sh)) % UInt16
189+ f &= significand_mask (Float32)
190+ # If `val` is subnormal, the tables are set up to force the
191+ # result to 0, so the significand has an implicit `1` in the
192+ # cases we care about.
193+ f |= significand_mask (Float32) + 0x1
194+ @inbounds h = (basetable[i] + (f >> sh) & significand_mask (Float16)) % UInt16
189195 # round
190196 # NOTE: we maybe should ignore NaNs here, but the payload is
191197 # getting truncated anyway so "rounding" it might not matter
@@ -867,6 +873,11 @@ exponent_one(::Type{Float16}) = 0x3c00
867873exponent_half (:: Type{Float16} ) = 0x3800
868874significand_mask (:: Type{Float16} ) = 0x03ff
869875
# Define `significand_bits(T)` and `exponent_bits(T)` for each float type as
# constants derived from that type's `significand_mask`.
for T in (Float16, Float32, Float64)
    # Number of stored significand bits: the count of trailing one bits in
    # the significand mask (e.g. 0x03ff for Float16).
    nsig = trailing_ones(significand_mask(T))
    # Total bit width minus the significand bits, minus one more bit
    # (presumably the sign bit), leaves the exponent field width.
    nexp = 8 * sizeof(T) - nsig - 1
    # The values are spliced in as literals, so each generated method simply
    # returns a constant.
    @eval significand_bits(::Type{$T}) = $nsig
    @eval exponent_bits(::Type{$T}) = $nexp
end
880+
870881# unsigned integer type with the same size as the float type
871882uinttype (:: Type{Float64} ) = UInt64
872883uinttype (:: Type{Float32} ) = UInt32
0 commit comments